In [17]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split 
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.datasets import fetch_openml
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

In [15]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### MNIST

In [None]:
mnist = fetch_openml('mnist_784')

X = mnist.data
y = mnist.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

#### Logistic Regression

In [None]:
lr = LogisticRegression(penalty='l2').fit(X_train, y_train)
y_pred = lr.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix: \n\n', cm) 

Confusion Matrix: 

 [[1348    0    3    2    2   11   14    1    6    0]
 [   0 1541    6    5    2    4    1    5   14    2]
 [   6   17 1294   19   20    4   16   13   50    4]
 [   5    5   50 1261    1   43    4   13   39   14]
 [   2    4    9    2 1259    2   15   11    6   40]
 [  11    5   12   46   13 1053   22    7   49   13]
 [  13    5   14    1   16   20 1310    3    5    0]
 [   3    6   21    6   15    1    1 1347    7   51]
 [   4   34   11   31    9   38   16    2 1210   13]
 [   4    4   10   16   49    7    1   39   13 1218]]


In [None]:
cr = classification_report(y_test, y_pred)
print('Classification Report: \n\n', cr)

Classification Report: 

               precision    recall  f1-score   support

           0       0.97      0.97      0.97      1387
           1       0.95      0.98      0.96      1580
           2       0.90      0.90      0.90      1443
           3       0.91      0.88      0.89      1435
           4       0.91      0.93      0.92      1350
           5       0.89      0.86      0.87      1231
           6       0.94      0.94      0.94      1387
           7       0.93      0.92      0.93      1458
           8       0.86      0.88      0.87      1368
           9       0.90      0.89      0.90      1361

    accuracy                           0.92     14000
   macro avg       0.92      0.92      0.92     14000
weighted avg       0.92      0.92      0.92     14000



In [None]:
importances = pd.DataFrame(data={
    'Attribute': np.arange(0,784,1),
    'Importance': lr.coef_[0]
})

importances = importances.sort_values(by='Importance', ascending=False)

In [None]:
importances.head(30).reset_index().drop('index', axis=1)

Unnamed: 0,Attribute,Importance
0,629,0.0043
1,360,0.00425
2,240,0.003677
3,388,0.003566
4,416,0.00338
5,275,0.00307
6,276,0.002845
7,190,0.002781
8,247,0.002706
9,387,0.002575


#### Decision Tree

In [None]:
dtc = DecisionTreeClassifier().fit(X_train, y_train)
y_pred = dtc.predict(X_test)

In [None]:
cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix: \n\n', cm) 

Confusion Matrix: 

 [[1279    4   14    6   15   18   29    4   10    8]
 [   0 1515   12   17    8    3    2    9   13    1]
 [  19   19 1221   36   18    7   22   31   50   20]
 [   7   15   51 1180   12   56   10   28   48   28]
 [   6    3   13   10 1182    7   19   10   27   73]
 [  22   16   22   56   12  997   25   19   41   21]
 [  26    5   18    6   22   27 1235    6   27   15]
 [  12   10   32   16   22    9    2 1307   10   38]
 [  20   26   43   46   25   34   31    9 1081   53]
 [   9    6   11   13   63   27   13   40   28 1151]]


In [None]:
cr = classification_report(y_test, y_pred)
print('Classification Report: \n\n', cr)

Classification Report: 

               precision    recall  f1-score   support

           0       0.91      0.92      0.92      1387
           1       0.94      0.96      0.95      1580
           2       0.85      0.85      0.85      1443
           3       0.85      0.82      0.84      1435
           4       0.86      0.88      0.87      1350
           5       0.84      0.81      0.83      1231
           6       0.89      0.89      0.89      1387
           7       0.89      0.90      0.89      1458
           8       0.81      0.79      0.80      1368
           9       0.82      0.85      0.83      1361

    accuracy                           0.87     14000
   macro avg       0.87      0.87      0.87     14000
weighted avg       0.87      0.87      0.87     14000



### 20NG

In [2]:
train_data = fetch_20newsgroups(subset='train')
train_df = pd.DataFrame({
    'Data': train_data.data,
    'Group': train_data.target
})

test_data = fetch_20newsgroups(subset='test')
test_df = pd.DataFrame({
    'Data': test_data.data,
    'Group': test_data.target
})

# Taking a sample of 100 articles for each group for train
train_df = train_df.groupby('Group', group_keys=False).apply(pd.DataFrame.sample, n=50)

# Taking a sample of 50 articles for each group for train
test_df = test_df.groupby('Group', group_keys=False).apply(pd.DataFrame.sample, n=25)

# Random shuffling dataframe
train_df = train_df.sample(frac=1)
test_df = test_df.sample(frac=1)

# Vectorizing the articles
vectorizer = TfidfVectorizer()
vectors_train = vectorizer.fit_transform(train_df.Data)
vectors_test = vectorizer.transform(test_df.Data)
n = vectors_train.shape[1]

vec_train = vectors_train.toarray()
vec_test = vectors_test.toarray()

vec_train = vec_train.tolist()
X_train = pd.DataFrame(vec_train, columns=np.arange(0,n,1))
y_train = train_df['Group']

vec_test = vec_test.tolist()
X_test = pd.DataFrame(vec_test, columns=np.arange(0,n,1))
y_test = test_df['Group']

#### Logistic Regression

In [6]:
lr = LogisticRegression(penalty='l2').fit(X_train, y_train)
y_pred = lr.predict(X_test)

In [7]:
cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix: \n\n', cm) 

Confusion Matrix: 

 [[18  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  0  0  1  5]
 [ 3 16  1  1  1  0  1  0  0  0  0  2  0  0  0  0  0  0  0  0]
 [ 0  1 18  2  1  0  1  0  1  1  0  0  0  0  0  0  0  0  0  0]
 [ 0  2  2 13  2  1  3  1  0  0  1  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  3 16  1  3  1  0  0  0  0  0  0  0  0  1  0  0  0]
 [ 0  3  4  1  0 15  1  0  0  0  0  0  0  0  0  0  1  0  0  0]
 [ 0  0  1  1  0  0 21  1  0  0  1  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  1  0  0  0 20  0  0  0  0  0  1  3  0  0  0  0  0]
 [ 0  0  0  0  0  0  1  3 19  0  0  0  0  1  0  0  1  0  0  0]
 [ 3  0  0  0  1  0  1  0  0 13  6  0  0  0  0  0  0  1  0  0]
 [ 0  0  0  0  0  0  1  0  1  1 20  0  0  0  2  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0 23  0  0  0  0  2  0  0  0]
 [ 0  1  0  1  2  0  1  2  0  0  0  0 14  0  3  0  1  0  0  0]
 [ 0  4  0  0  0  1  3  2  1  1  0  0  0  9  0  1  0  0  1  2]
 [ 0  1  0  0  0  1  0  0  0  0  0  0  0  0 19  2  0  0  1  1]
 [ 1  0  0  0  0  0  0  0  0  0  0

In [8]:
cr = classification_report(y_test, y_pred)
print('Classification Report: \n\n', cr)

Classification Report: 

               precision    recall  f1-score   support

           0       0.56      0.72      0.63        25
           1       0.55      0.64      0.59        25
           2       0.69      0.72      0.71        25
           3       0.57      0.52      0.54        25
           4       0.70      0.64      0.67        25
           5       0.79      0.60      0.68        25
           6       0.57      0.84      0.68        25
           7       0.67      0.80      0.73        25
           8       0.83      0.76      0.79        25
           9       0.72      0.52      0.60        25
          10       0.71      0.80      0.75        25
          11       0.88      0.92      0.90        25
          12       0.93      0.56      0.70        25
          13       0.75      0.36      0.49        25
          14       0.70      0.76      0.73        25
          15       0.64      0.72      0.68        25
          16       0.59      0.88      0.71        25
 

In [10]:
importances = pd.DataFrame(data={
    'Attribute': np.arange(0,n,1),
    'Importance': lr.coef_[0]
})

importances = importances.sort_values(by='Importance', ascending=False)

In [11]:
importances.head(30).reset_index().drop('index', axis=1)

Unnamed: 0,Attribute,Importance
0,5602,1.158345
1,14578,1.114523
2,13738,1.093198
3,13181,1.052489
4,11168,1.049761
5,20847,0.978029
6,22864,0.902438
7,16891,0.873379
8,13199,0.860602
9,3977,0.839118


#### Decision Tree

In [12]:
dtc = DecisionTreeClassifier().fit(X_train, y_train)
y_pred = dtc.predict(X_test)

In [13]:
cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix: \n\n', cm) 

Confusion Matrix: 

 [[ 9  3  0  0  6  0  0  0  1  0  1  0  0  1  0  0  0  1  2  1]
 [ 0  3  2  6  1  1  0  0  2  0  1  2  2  2  0  0  1  0  1  1]
 [ 0  1 11  2  1  3  0  0  0  0  0  2  2  1  0  0  1  1  0  0]
 [ 0  1  4  9  2  1  0  0  0  0  2  1  1  0  0  1  1  0  2  0]
 [ 0  1  1  1 17  3  0  0  1  0  0  0  0  0  0  0  1  0  0  0]
 [ 0  1  9  6  1  3  0  0  0  0  1  0  3  0  1  0  0  0  0  0]
 [ 1  2  0  0  2  0 15  0  0  2  0  2  0  0  0  0  0  0  1  0]
 [ 0  0  0  3  1  0  0 16  1  0  0  1  1  0  0  0  0  0  2  0]
 [ 0  0  0  1  2  0  1  2  9  0  1  0  2  0  1  0  1  1  0  4]
 [ 2  0  0  0  1  0  2  0  0 11  5  0  0  1  0  0  0  0  2  1]
 [ 3  1  2  3  0  0  1  0  1  3  8  0  0  1  0  0  1  0  1  0]
 [ 0  1  0  1  0  1  0  0  1  0  0 10  2  0  0  2  2  0  4  1]
 [ 0  4  0  7  1  1  1  3  1  0  0  1  2  0  2  0  0  0  1  1]
 [ 1  3  1  3  2  2  0  1  0  0  0  1  5  1  1  1  0  0  1  2]
 [ 0  2  1  1  2  0  0  0  1  0  2  1  1  0  9  0  0  1  2  2]
 [ 4  0  0  0  1  1  0  0  0  2  0

In [14]:
cr = classification_report(y_test, y_pred)
print('Classification Report: \n\n', cr)

Classification Report: 

               precision    recall  f1-score   support

           0       0.30      0.36      0.33        25
           1       0.10      0.12      0.11        25
           2       0.34      0.44      0.39        25
           3       0.20      0.36      0.26        25
           4       0.40      0.68      0.50        25
           5       0.19      0.12      0.15        25
           6       0.71      0.60      0.65        25
           7       0.64      0.64      0.64        25
           8       0.45      0.36      0.40        25
           9       0.61      0.44      0.51        25
          10       0.36      0.32      0.34        25
          11       0.43      0.40      0.42        25
          12       0.08      0.08      0.08        25
          13       0.08      0.04      0.05        25
          14       0.64      0.36      0.46        25
          15       0.52      0.44      0.48        25
          16       0.48      0.56      0.52        25
 

### Spambase

In [18]:
cols = [
    'word_freq_make',
    'word_freq_address',
    'word_freq_all', 
    'word_freq_3d',  
    'word_freq_our', 
    'word_freq_over',
    'word_freq_remove', 
    'word_freq_internet',
    'word_freq_order',  
    'word_freq_mail',
    'word_freq_receive',
    'word_freq_will',
    'word_freq_people', 
    'word_freq_report', 
    'word_freq_addresses',
    'word_freq_free',
    'word_freq_business',
    'word_freq_email',  
    'word_freq_you', 
    'word_freq_credit', 
    'word_freq_your',
    'word_freq_font',
    'word_freq_000', 
    'word_freq_money',  
    'word_freq_hp',  
    'word_freq_hpl', 
    'word_freq_george', 
    'word_freq_650', 
    'word_freq_lab', 
    'word_freq_labs',
    'word_freq_telnet', 
    'word_freq_857', 
    'word_freq_data',
    'word_freq_415', 
    'word_freq_85',  
    'word_freq_technology',
    'word_freq_1999',
    'word_freq_parts',  
    'word_freq_pm',  
    'word_freq_direct', 
    'word_freq_cs',  
    'word_freq_meeting',
    'word_freq_original',
    'word_freq_project',
    'word_freq_re',  
    'word_freq_edu', 
    'word_freq_table',  
    'word_freq_conference',
    'char_freq_;',   
    'char_freq_(',   
    'char_freq_[',   
    'char_freq_!',   
    'char_freq_$',   
    'char_freq_#',   
    'capital_run_length_average',
    'capital_run_length_longest',
    'capital_run_length_total',
    'type'
]

spam_df = pd.read_csv('/content/drive/MyDrive/UML/HW3/spambase.csv', names=cols)

# Stratified sampling

# Taking 80% data for training
spam_train = spam_df.groupby('type', group_keys=False).apply(lambda x: x.sample(frac=0.8))

# Taking remaining 20% for testing
spam_test = spam_df.drop(spam_train.index)

# Random shuffling dataframe
spam_train = spam_train.sample(frac=1)
spam_test = spam_test.sample(frac=1)

X_train = spam_train.loc[:, spam_train.columns != 'type']
y_train = spam_train.loc[:, spam_train.columns == 'type']

X_test = spam_test.loc[:, spam_test.columns != 'type']
y_test = spam_test.loc[:, spam_test.columns == 'type']

std = StandardScaler()
X_train_std = std.fit_transform(X_train)
X_test_std = std.transform(X_test)

#### Logistic Regression

In [19]:
lr = LogisticRegression(penalty='l2').fit(X_train, y_train)
y_pred = lr.predict(X_test)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [20]:
cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix: \n\n', cm) 

Confusion Matrix: 

 [[525  33]
 [ 38 325]]


In [21]:
cr = classification_report(y_test, y_pred)
print('Classification Report: \n\n', cr)

Classification Report: 

               precision    recall  f1-score   support

           0       0.93      0.94      0.94       558
           1       0.91      0.90      0.90       363

    accuracy                           0.92       921
   macro avg       0.92      0.92      0.92       921
weighted avg       0.92      0.92      0.92       921



In [24]:
importances = pd.DataFrame(data={
    'Attribute': np.arange(0,57,1),
    'Importance': lr.coef_[0]
})

importances = importances.sort_values(by='Importance', ascending=False)

In [25]:
importances.head(30).reset_index().drop('index', axis=1)

Unnamed: 0,Attribute,Importance
0,15,1.117107
1,6,1.028335
2,22,0.872694
3,4,0.860798
4,16,0.686596
5,7,0.642088
6,52,0.620654
7,23,0.589563
8,5,0.436109
9,19,0.425914


#### Decision Tree

In [26]:
dtc = DecisionTreeClassifier().fit(X_train, y_train)
y_pred = dtc.predict(X_test)

In [27]:
cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix: \n\n', cm) 

Confusion Matrix: 

 [[521  37]
 [ 48 315]]


In [28]:
cr = classification_report(y_test, y_pred)
print('Classification Report: \n\n', cr)

Classification Report: 

               precision    recall  f1-score   support

           0       0.92      0.93      0.92       558
           1       0.89      0.87      0.88       363

    accuracy                           0.91       921
   macro avg       0.91      0.90      0.90       921
weighted avg       0.91      0.91      0.91       921

