In [None]:
from functools import partial

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import mutual_info_classif

In [None]:
def run_LogisticRegression(X_train_red, y_train, X_test_red, y_test):
    """Fit an L2-penalised logistic regression and evaluate it on the test set.

    Parameters
    ----------
    X_train_red, y_train : training features (after feature selection) and labels.
    X_test_red, y_test : test features (same selected columns) and labels.

    Returns
    -------
    tuple
        ``(cm, cr)`` where ``cm`` is the confusion matrix of ``y_test`` against
        the predictions and ``cr`` is the classification report produced by
        ``sklearn.metrics.classification_report``.
    """
    classifier = LogisticRegression(penalty='l2')
    classifier.fit(X_train_red, y_train)

    predictions = classifier.predict(X_test_red)

    return (
        confusion_matrix(y_test, predictions),
        classification_report(y_test, predictions),
    )

In [None]:
def run_DecisionTreeClassifier(X_train_red, y_train, X_test_red, y_test,
                               random_state=None):
    """Fit a decision tree and evaluate it on the test set.

    Parameters
    ----------
    X_train_red, y_train : training features (after feature selection) and labels.
    X_test_red, y_test : test features (same selected columns) and labels.
    random_state : int, RandomState instance or None, default=None
        Forwarded to ``DecisionTreeClassifier``. The default ``None`` keeps
        the original unseeded behaviour; pass an integer to make the fitted
        tree (and therefore the reported metrics) repeatable across runs.

    Returns
    -------
    tuple
        ``(cm, cr)`` where ``cm`` is the confusion matrix of ``y_test`` against
        the predictions and ``cr`` is the classification report produced by
        ``sklearn.metrics.classification_report``.
    """
    # Named `tree` rather than `lr`: this is not a logistic regression.
    tree = DecisionTreeClassifier(random_state=random_state).fit(X_train_red, y_train)
    y_pred = tree.predict(X_test_red)
    cm = confusion_matrix(y_test, y_pred)
    cr = classification_report(y_test, y_pred)

    return cm, cr

In [None]:
# Fixed seed so that the per-group sampling and the shuffling below select the
# same articles on every Restart & Run All.
SEED = 42
# Number of articles drawn from each newsgroup.
N_TRAIN_PER_GROUP = 50
N_TEST_PER_GROUP = 25

train_data = fetch_20newsgroups(subset='train')
train_df = pd.DataFrame({
    'Data': train_data.data,
    'Group': train_data.target
})

test_data = fetch_20newsgroups(subset='test')
test_df = pd.DataFrame({
    'Data': test_data.data,
    'Group': test_data.target
})

# Taking a sample of 50 articles for each group for train.
# GroupBy.sample keeps the 'Group' column and does not rely on
# groupby.apply operating on the grouping column.
train_df = train_df.groupby('Group').sample(n=N_TRAIN_PER_GROUP, random_state=SEED)

# Taking a sample of 25 articles for each group for test
test_df = test_df.groupby('Group').sample(n=N_TEST_PER_GROUP, random_state=SEED)

# Random shuffling dataframe (seeded, so the row order is repeatable)
train_df = train_df.sample(frac=1, random_state=SEED)
test_df = test_df.sample(frac=1, random_state=SEED)

# Vectorizing the articles; the vocabulary is learned on train only and then
# reused for test so both matrices share the same columns.
vectorizer = TfidfVectorizer()
vectors_train = vectorizer.fit_transform(train_df.Data)
vectors_test = vectorizer.transform(test_df.Data)
n = vectors_train.shape[1]

# Dense copies of the sparse TF-IDF matrices. They are passed to DataFrame
# directly; converting to nested Python lists first only costs time and memory.
vec_train = vectors_train.toarray()
vec_test = vectors_test.toarray()

X_train = pd.DataFrame(vec_train, columns=np.arange(0, n, 1))
y_train = train_df['Group']

X_test = pd.DataFrame(vec_test, columns=np.arange(0, n, 1))
y_test = test_df['Group']

### Chi-Square

In [None]:
# Score every TF-IDF column against the labels with the chi-square statistic,
# keep the 200 best-scoring columns, and apply that same column mask to both
# the training and the test matrix.
ch = SelectKBest(chi2, k=200)
ch.fit(X_train, y_train)
X_train_red = ch.transform(X_train)
X_test_red = ch.transform(X_test)

#### Logistic Regression

In [None]:
# Train and evaluate logistic regression on the features kept by the
# chi-square selector (X_train_red / X_test_red from the cell above).
cm, cr = run_LogisticRegression(X_train_red, y_train, X_test_red, y_test)

In [None]:
# Confusion matrix from the run above: rows are true groups, columns are
# predicted groups (scikit-learn's confusion_matrix convention).
print('Confusion Matrix: \n\n', cm) 

Confusion Matrix: 

 [[11  0  0  0  0  0  0  0  0  2  1  0  2  0  0  8  0  0  0  1]
 [ 0 10  3  2  0  1  0  0  0  0  0  0  7  0  2  0  0  0  0  0]
 [ 0  0 19  0  1  0  0  0  0  0  0  0  5  0  0  0  0  0  0  0]
 [ 0  2  7 11  0  0  0  0  0  0  1  1  3  0  0  0  0  0  0  0]
 [ 0  0  1  4 13  1  1  0  0  1  0  0  3  0  1  0  0  0  0  0]
 [ 0  2  5  0  0 10  0  0  0  0  0  0  8  0  0  0  0  0  0  0]
 [ 0  1  0  4  0  0 15  1  0  0  1  1  2  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0 17  0  0  0  0  7  0  0  0  1  0  0  0]
 [ 0  0  0  1  0  0  2  1 15  2  0  0  2  0  0  2  0  0  0  0]
 [ 0  0  0  0  0  0  1  0  0 11 10  0  2  1  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  1 22  0  0  0  0  1  1  0  0  0]
 [ 0  0  1  0  0  0  0  0  0  0  0 21  3  0  0  0  0  0  0  0]
 [ 0  0  0  3  0  0  1  1  1  1  1  4 12  0  0  1  0  0  0  0]
 [ 0  0  0  0  0  0  1  0  0  2  0  0 12  8  1  1  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  6  0 19  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0

In [None]:
# Per-group precision / recall / f1 for the same run.
print('Classification Report: \n\n', cr)

Classification Report: 

               precision    recall  f1-score   support

           0       0.65      0.44      0.52        25
           1       0.67      0.40      0.50        25
           2       0.51      0.76      0.61        25
           3       0.44      0.44      0.44        25
           4       0.87      0.52      0.65        25
           5       0.83      0.40      0.54        25
           6       0.68      0.60      0.64        25
           7       0.81      0.68      0.74        25
           8       0.94      0.60      0.73        25
           9       0.37      0.44      0.40        25
          10       0.59      0.88      0.71        25
          11       0.78      0.84      0.81        25
          12       0.13      0.48      0.20        25
          13       0.73      0.32      0.44        25
          14       0.76      0.76      0.76        25
          15       0.50      0.84      0.63        25
          16       0.52      0.48      0.50        25
 

#### Decision Tree

In [None]:
# Same chi-square-selected features, now with a decision tree.
# Note: this overwrites the cm / cr produced by the logistic-regression run.
cm, cr = run_DecisionTreeClassifier(X_train_red, y_train, X_test_red, y_test)

In [None]:
# Confusion matrix of the decision-tree run (rows: true group, columns: predicted).
print('Confusion Matrix: \n\n', cm) 

Confusion Matrix: 

 [[11  0  0  0  1  0  0  0  0  0  0  0  1  0  0  5  0  0  2  5]
 [ 0 11  1  0  1  1  0  0  0  0  0  0  8  0  3  0  0  0  0  0]
 [ 0  2 16  1  0  0  0  0  0  0  0  0  6  0  0  0  0  0  0  0]
 [ 0  2  4  8  3  0  1  0  0  0  0  0  6  0  1  0  0  0  0  0]
 [ 0  2  1  3 10  0  2  0  0  0  0  0  3  0  0  0  1  0  3  0]
 [ 0  1  6  0  0  8  0  0  0  0  0  0 10  0  0  0  0  0  0  0]
 [ 0  1  0  1  0  0 17  0  0  0  2  0  4  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  1 13  0  0  0  0  7  0  0  0  1  2  1  0]
 [ 0  0  0  1  1  0  2  1 14  1  0  0  4  0  0  0  1  0  0  0]
 [ 0  0  1  0  0  0  3  0  0 11  1  0  2  1  0  0  0  0  2  4]
 [ 0  0  0  0  0  0  4  0  1  2 16  0  0  0  0  0  1  0  0  1]
 [ 0  1  0  0  0  0  0  0  0  0  0 17  6  0  0  0  0  0  1  0]
 [ 0  1  0  3  0  1  0  1  0  1  0  0 14  1  1  1  0  0  1  0]
 [ 0  0  1  0  0  2  2  0  0  0  0  0 12  7  0  0  0  0  1  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  7  0 18  0  0  0  0  0]
 [ 4  0  0  0  0  0  0  0  2  0  0

In [None]:
# Per-group precision / recall / f1 of the decision-tree run.
print('Classification Report: \n\n', cr)

Classification Report: 

               precision    recall  f1-score   support

           0       0.55      0.44      0.49        25
           1       0.50      0.44      0.47        25
           2       0.50      0.64      0.56        25
           3       0.47      0.32      0.38        25
           4       0.53      0.40      0.45        25
           5       0.67      0.32      0.43        25
           6       0.52      0.68      0.59        25
           7       0.81      0.52      0.63        25
           8       0.74      0.56      0.64        25
           9       0.52      0.44      0.48        25
          10       0.84      0.64      0.73        25
          11       1.00      0.68      0.81        25
          12       0.12      0.56      0.20        25
          13       0.47      0.28      0.35        25
          14       0.72      0.72      0.72        25
          15       0.55      0.48      0.51        25
          16       0.33      0.24      0.28        25
 

### Mutual Information

In [None]:
# Keep the 200 columns with the highest estimated mutual information with the
# labels. mutual_info_classif is stochastic on dense (continuous) input, so
# its random_state is pinned here; otherwise the selected columns, and every
# metric below, can change from one run to the next.
mi = SelectKBest(score_func=partial(mutual_info_classif, random_state=42), k=200)
X_train_red = mi.fit_transform(X_train, y_train)
X_test_red = mi.transform(X_test)

#### Logistic Regression

In [None]:
# Train and evaluate logistic regression on the features kept by the
# mutual-information selector (X_train_red / X_test_red from the cell above).
cm, cr = run_LogisticRegression(X_train_red, y_train, X_test_red, y_test)

In [None]:
# Confusion matrix from the run above (rows: true group, columns: predicted).
print('Confusion Matrix: \n\n', cm) 

Confusion Matrix: 

 [[11  0  0  0  0  0  0  0  0  2  1  0  2  0  0  8  0  0  0  1]
 [ 0 10  3  2  0  1  0  0  0  0  0  0  7  0  2  0  0  0  0  0]
 [ 0  0 19  0  1  0  0  0  0  0  0  0  5  0  0  0  0  0  0  0]
 [ 0  2  7 11  0  0  0  0  0  0  1  1  3  0  0  0  0  0  0  0]
 [ 0  0  1  4 13  1  1  0  0  1  0  0  3  0  1  0  0  0  0  0]
 [ 0  2  5  0  0 10  0  0  0  0  0  0  8  0  0  0  0  0  0  0]
 [ 0  1  0  4  0  0 15  1  0  0  1  1  2  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0 17  0  0  0  0  7  0  0  0  1  0  0  0]
 [ 0  0  0  1  0  0  2  1 15  2  0  0  2  0  0  2  0  0  0  0]
 [ 0  0  0  0  0  0  1  0  0 11 10  0  2  1  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  1 22  0  0  0  0  1  1  0  0  0]
 [ 0  0  1  0  0  0  0  0  0  0  0 21  3  0  0  0  0  0  0  0]
 [ 0  0  0  3  0  0  1  1  1  1  1  4 12  0  0  1  0  0  0  0]
 [ 0  0  0  0  0  0  1  0  0  2  0  0 12  8  1  1  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  6  0 19  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0

In [None]:
# Per-group precision / recall / f1 for the same run.
print('Classification Report: \n\n', cr)

Classification Report: 

               precision    recall  f1-score   support

           0       0.65      0.44      0.52        25
           1       0.67      0.40      0.50        25
           2       0.51      0.76      0.61        25
           3       0.44      0.44      0.44        25
           4       0.87      0.52      0.65        25
           5       0.83      0.40      0.54        25
           6       0.68      0.60      0.64        25
           7       0.81      0.68      0.74        25
           8       0.94      0.60      0.73        25
           9       0.37      0.44      0.40        25
          10       0.59      0.88      0.71        25
          11       0.78      0.84      0.81        25
          12       0.13      0.48      0.20        25
          13       0.73      0.32      0.44        25
          14       0.76      0.76      0.76        25
          15       0.50      0.84      0.63        25
          16       0.52      0.48      0.50        25
 

#### Decision Tree

In [None]:
# Same mutual-information-selected features, now with a decision tree.
# Note: this overwrites the cm / cr produced by the logistic-regression run.
cm, cr = run_DecisionTreeClassifier(X_train_red, y_train, X_test_red, y_test)

In [None]:
# Confusion matrix of the decision-tree run (rows: true group, columns: predicted).
print('Confusion Matrix: \n\n', cm) 

Confusion Matrix: 

 [[11  0  0  0  1  0  0  0  0  0  0  0  1  0  0  5  0  0  2  5]
 [ 0 11  1  0  1  1  0  0  0  0  0  0  8  0  3  0  0  0  0  0]
 [ 0  2 16  1  0  0  0  0  0  0  0  0  6  0  0  0  0  0  0  0]
 [ 0  2  4  8  3  0  1  0  0  0  0  0  6  0  1  0  0  0  0  0]
 [ 0  2  1  3 10  0  2  0  0  0  0  0  3  0  0  0  1  0  3  0]
 [ 0  1  6  0  0  8  0  0  0  0  0  0 10  0  0  0  0  0  0  0]
 [ 0  1  0  1  0  0 17  0  0  0  2  0  4  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  1 13  0  0  0  0  7  0  0  0  1  2  1  0]
 [ 0  0  0  1  1  0  2  1 14  1  0  0  4  0  0  0  1  0  0  0]
 [ 0  0  1  0  0  0  3  0  0 11  1  0  2  1  0  0  0  0  2  4]
 [ 0  0  0  0  0  0  4  0  1  2 16  0  0  0  0  0  1  0  0  1]
 [ 0  1  0  0  0  0  0  0  0  0  0 17  6  0  0  0  0  0  1  0]
 [ 0  1  0  3  0  1  0  1  0  1  0  0 14  1  1  1  0  0  1  0]
 [ 0  0  1  0  0  2  2  0  0  0  0  0 12  7  0  0  0  0  1  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  7  0 18  0  0  0  0  0]
 [ 4  0  0  0  0  0  0  0  2  0  0

In [None]:
# Per-group precision / recall / f1 of the decision-tree run.
print('Classification Report: \n\n', cr)

Classification Report: 

               precision    recall  f1-score   support

           0       0.55      0.44      0.49        25
           1       0.50      0.44      0.47        25
           2       0.50      0.64      0.56        25
           3       0.47      0.32      0.38        25
           4       0.53      0.40      0.45        25
           5       0.67      0.32      0.43        25
           6       0.52      0.68      0.59        25
           7       0.81      0.52      0.63        25
           8       0.74      0.56      0.64        25
           9       0.52      0.44      0.48        25
          10       0.84      0.64      0.73        25
          11       1.00      0.68      0.81        25
          12       0.12      0.56      0.20        25
          13       0.47      0.28      0.35        25
          14       0.72      0.72      0.72        25
          15       0.55      0.48      0.51        25
          16       0.33      0.24      0.28        25
 