In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [2]:
# Read the raw train/test splits and the pre-cleaned combined table in one pass.
# `combined` is sliced positionally into train/test portions further down.
train, test, combined = map(
    pd.read_csv,
    ('dataset/train.csv', 'dataset/test.csv', 'dataset/cleaned_combined.csv'),
)

In [3]:
def export_result(file_name, prediction):
    """Write test-set predictions to ``file_name`` in CSV format.

    The output has two columns: ``id``, copied from the notebook-level ``test``
    frame, and ``target``, filled with ``prediction``. The row index is not
    written. Prints ``Exported`` once the file has been saved.
    """
    ids = pd.DataFrame({'id': test['id']})
    ids.assign(target=prediction).to_csv(file_name, index=False)
    print('Exported')

In [4]:
# `combined` is assumed to hold the cleaned train rows first, followed by the cleaned
# test rows, so the two parts are recovered purely by position.
n_train = train.shape[0]

# Guard the positional split: if cleaning dropped or added rows, features would be
# silently paired with the wrong labels / ids further down.
assert len(combined) == n_train + len(test), (
    "cleaned_combined.csv must contain exactly len(train) + len(test) rows"
)

df = combined['text']
x_train = combined.iloc[:n_train]   # explicit positional slice
x_test = combined.iloc[n_train:]
y_train = train['target']

In [5]:
from sklearn.model_selection import train_test_split

# Split from the untouched source frames instead of from x_train / y_train themselves:
# the original form overwrote its own inputs, so every re-run of this cell split the
# already-split training set again. The inputs below are the same rows and labels the
# previous cell assigns, so the first run produces an identical split.
x_train, x_valid, y_train, y_valid = train_test_split(
    combined[:train.shape[0]],
    train['target'],
    test_size=0.2,      # hold out 20% of the labelled rows for validation
    random_state=42,    # fixed seed keeps the split reproducible
)

In [6]:
# Show the size of each split: features then labels, for train and validation.
for split in (x_train, y_train, x_valid, y_valid):
    print(split.shape)

(6090, 7)
(6090,)
(1523, 7)
(1523,)


In [7]:
def check_model_performance(classifier):
    """Fit ``classifier`` on TF-IDF text features and print its scores.

    The estimator becomes the last step of a pipeline
    (``CountVectorizer`` -> ``TfidfTransformer`` -> ``classifier``) that is
    fitted on the ``text`` column of the notebook-level ``x_train`` with labels
    ``y_train``. The function then prints, in order: the confusion matrix and
    classification report on ``x_valid`` / ``y_valid``, the validation
    accuracy, and the accuracy on the training data.

    Returns the fitted pipeline (not the bare estimator), so callers can call
    ``predict`` directly on raw text.
    """
    steps = [
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', classifier),
    ]
    model = Pipeline(steps)
    model.fit(x_train['text'], y_train)

    # Held-out performance.
    valid_pred = model.predict(x_valid['text'])
    print(metrics.confusion_matrix(y_valid, valid_pred))
    print(metrics.classification_report(y_valid, valid_pred))
    print("validation score", metrics.accuracy_score(y_valid, valid_pred))

    # Training accuracy, printed next to the validation score for comparison.
    train_pred = model.predict(x_train['text'])
    print("training score", metrics.accuracy_score(y_train, train_pred))
    return model

In [8]:
from sklearn.ensemble import RandomForestClassifier

# Random forests are stochastic (bootstrap samples, random feature subsets). Seed the
# estimator with the same value used for the train/validation split so the reported
# scores can be reproduced on a re-run.
classifier = check_model_performance(
    RandomForestClassifier(n_estimators=100, random_state=42)
)

[[777  97]
 [228 421]]
              precision    recall  f1-score   support

           0       0.77      0.89      0.83       874
           1       0.81      0.65      0.72       649

   micro avg       0.79      0.79      0.79      1523
   macro avg       0.79      0.77      0.77      1523
weighted avg       0.79      0.79      0.78      1523

validation score 0.7866053841103086
training score 0.99688013136289


In [9]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with the L-BFGS solver, evaluated through the shared helper.
log_reg = LogisticRegression(solver='lbfgs')
classifier = check_model_performance(log_reg)

[[753 121]
 [189 460]]
              precision    recall  f1-score   support

           0       0.80      0.86      0.83       874
           1       0.79      0.71      0.75       649

   micro avg       0.80      0.80      0.80      1523
   macro avg       0.80      0.79      0.79      1523
weighted avg       0.80      0.80      0.79      1523

validation score 0.7964543663821405
training score 0.8885057471264368


In [10]:
from sklearn.ensemble import GradientBoostingClassifier

# Seed the booster (same value as the train/validation split) so the 500-stage model
# yields the same scores on every run.
classifier = check_model_performance(
    GradientBoostingClassifier(n_estimators=500, random_state=42)
)

[[761 113]
 [215 434]]
              precision    recall  f1-score   support

           0       0.78      0.87      0.82       874
           1       0.79      0.67      0.73       649

   micro avg       0.78      0.78      0.78      1523
   macro avg       0.79      0.77      0.77      1523
weighted avg       0.79      0.78      0.78      1523

validation score 0.7846355876559422
training score 0.8847290640394089


In [11]:
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier

# Bagging draws random bootstrap samples for each k-NN member; fix the seed (same value
# as the train/validation split) so the ensemble's scores are reproducible.
classifier = check_model_performance(
    BaggingClassifier(KNeighborsClassifier(), random_state=42)
)

[[745 129]
 [225 424]]
              precision    recall  f1-score   support

           0       0.77      0.85      0.81       874
           1       0.77      0.65      0.71       649

   micro avg       0.77      0.77      0.77      1523
   macro avg       0.77      0.75      0.76      1523
weighted avg       0.77      0.77      0.76      1523

validation score 0.767564018384767
training score 0.8453201970443349


In [12]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes with default settings, evaluated through the shared helper.
naive_bayes = MultinomialNB()
classifier = check_model_performance(naive_bayes)

[[785  89]
 [216 433]]
              precision    recall  f1-score   support

           0       0.78      0.90      0.84       874
           1       0.83      0.67      0.74       649

   micro avg       0.80      0.80      0.80      1523
   macro avg       0.81      0.78      0.79      1523
weighted avg       0.80      0.80      0.80      1523

validation score 0.7997373604727511
training score 0.89376026272578


In [13]:
from sklearn.svm import LinearSVC

# LinearSVC's dual coordinate-descent solver shuffles the data with a pseudo-random
# generator; seed it (same value as the train/validation split) so scores are stable
# across runs.
classifier = check_model_performance(LinearSVC(random_state=42))

[[721 153]
 [191 458]]
              precision    recall  f1-score   support

           0       0.79      0.82      0.81       874
           1       0.75      0.71      0.73       649

   micro avg       0.77      0.77      0.77      1523
   macro avg       0.77      0.77      0.77      1523
weighted avg       0.77      0.77      0.77      1523

validation score 0.7741300065659882
training score 0.9824302134646963


In [14]:
from sklearn.svm import SVC

# Evaluate an SVC (default kernel, gamma='scale') and use the fitted pipeline to
# produce the submission file.
# NOTE(review): check_model_performance fits on x_train, which at this point is only
# the 80% training split; consider refitting on all labelled rows before submitting.
classifier = check_model_performance(SVC(gamma='scale'))

# The file is written with DataFrame.to_csv, so give it the matching .csv extension
# (it was previously saved as a bare 'submission_svm').
export_result('submission_svm.csv', classifier.predict(x_test['text']))

[[773 101]
 [205 444]]
              precision    recall  f1-score   support

           0       0.79      0.88      0.83       874
           1       0.81      0.68      0.74       649

   micro avg       0.80      0.80      0.80      1523
   macro avg       0.80      0.78      0.79      1523
weighted avg       0.80      0.80      0.80      1523

validation score 0.799080761654629
training score 0.9735632183908046
Exported


In [15]:
from sklearn import tree

# Decision-tree fitting permutes features at each split, so an unseeded tree can differ
# between runs. Seed it (same value as the train/validation split) for reproducible
# scores.
classifier = check_model_performance(
    tree.DecisionTreeClassifier(random_state=42)
)

[[643 231]
 [212 437]]
              precision    recall  f1-score   support

           0       0.75      0.74      0.74       874
           1       0.65      0.67      0.66       649

   micro avg       0.71      0.71      0.71      1523
   macro avg       0.70      0.70      0.70      1523
weighted avg       0.71      0.71      0.71      1523

validation score 0.7091267235718975
training score 0.99688013136289


In [16]:
import xgboost as xgb

# XGBoost classifier with library defaults, evaluated through the shared helper.
xgb_model = xgb.XGBClassifier()
classifier = check_model_performance(xgb_model)

[[816  58]
 [324 325]]
              precision    recall  f1-score   support

           0       0.72      0.93      0.81       874
           1       0.85      0.50      0.63       649

   micro avg       0.75      0.75      0.75      1523
   macro avg       0.78      0.72      0.72      1523
weighted avg       0.77      0.75      0.73      1523

validation score 0.7491792514773473
training score 0.7779967159277504
