In [1]:
import os
import numpy as np 
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

In [5]:
train_data_path = os.path.join(os.getcwd(), 'Data','except-cornell.csv')
train = pd.read_csv(train_data_path, encoding='Latin-1')
test_data_path = os.path.join(os.getcwd(), 'Data','cornell.csv')
test = pd.read_csv(test_data_path, encoding='Latin-1')

In [6]:
 X_train=train.drop(['label'], axis=1) #features
y_train=train['label'] #targets
X_test=test.drop(['label'], axis=1) #features
y_test=test['label'] #targets

In [7]:
from sklearn.feature_selection import mutual_info_classif
mi = mutual_info_classif(X_train, y_train,copy =False)

X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

mi= pd.Series(mi)
mi.index = X_train.columns
mi.sort_values(ascending=False, inplace=True)

In [8]:
from sklearn.feature_selection import SelectPercentile, chi2
selector = SelectPercentile(mutual_info_classif, percentile=10).fit(X_train, y_train)
print(len(X_train.columns[selector.get_support()]))

X_train_mi = selector.transform(X_train)
X_test_mi = selector.transform(X_test)

3273


In [9]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_vect = TfidfTransformer()
train_tfidf = tfidf_vect.fit_transform(X_train_mi)
test_tfidf = tfidf_vect.fit_transform(X_test_mi)

X_train_tfidf=train_tfidf.toarray()
X_test_tfidf=test_tfidf.toarray()

In [14]:
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score
skf = StratifiedKFold(n_splits=4,random_state=0,shuffle=True)

list_f1=[]
list_accuracy=[]
list_Precision=[]
list_Recall=[]

for train_la, valid_la in (skf.split(X_train_tfidf,y_train)):
    X_train_k, X_validation = X_train_tfidf[train_la], X_train_tfidf[valid_la]
    y_train_k, y_validation = y_train[train_la], y_train[valid_la]
    lr=LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial')
    lr.fit(X_train_k, y_train_k)  
    a=lr.predict(X_test_tfidf)
    
    f1_score=metrics.f1_score(y_test, a, average='micro')
    accuracy=accuracy_score(y_test, a)
    p = precision_score(y_test, a, average='micro')
    R = recall_score(y_test, a, average='micro')
        
    list_f1.append(f1_score)
    list_accuracy.append(accuracy)
    list_Precision.append(p)
    list_Recall.append(R)
    
print('Average precision score mean:', np.mean(list_Precision))
print('Average precision score Standard Deviation:', np.std(list_Precision))
print('Average recall score mean:', np.mean(list_Recall))
print('Average recall score Standard Deviation:', np.std(list_Recall)) 
print('f1_score mean:', np.mean(list_f1))
print('f1_score Standard Deviation:', np.std(list_f1))
print('accuracy mean:', np.mean(list_accuracy))
print('accuracy Standard Deviation:', np.std(list_accuracy))


Average precision score mean: 0.40960501405726
Average precision score Standard Deviation: 0.004590557720522573
Average recall score mean: 0.6415846514430049
Average recall score Standard Deviation: 0.009173293001842038
f1_score mean: 0.7673010380622838
f1_score Standard Deviation: 0.0029827221547833512
accuracy mean: 0.7673010380622838
accuracy Standard Deviation: 0.0029827221547833512
