In [1]:
import os
from nltk.corpus import stopwords
from nltk import word_tokenize
from string import punctuation
import random
import numpy as np

# Data Preprocessing

In [2]:
stop_words = stopwords.words('english')

In [3]:
negative_documents = []
max_len_negative = 0
for file in os.listdir('train_data/neg'):
    with open('train_data/neg/' + file) as f:
        text = f.read()
        tokens = word_tokenize(text)
        translator = str.maketrans('', '', punctuation)
        tokens = [w.translate(translator) for w in tokens]
        tokens = [w for w in tokens if not w in stop_words]
        if len(tokens) > max_len_negative:
            max_len_negative = len(tokens)
        negative_documents.append(' '.join(tokens))
len(negative_documents)

1000

In [4]:
positive_documents = []
max_len_positive = 0
for file in os.listdir('train_data/pos'):
    with open('train_data/pos/' + file) as f:
        text = f.read()
        tokens = word_tokenize(text)
        translator = str.maketrans('', '', punctuation)
        tokens = [w.translate(translator) for w in tokens]
        tokens = [w for w in tokens if not w in stop_words]
        if len(tokens) > max_len_positive:
            max_len_positive = len(tokens)
        positive_documents.append(' '.join(tokens))
len(positive_documents)

1000

In [5]:
X = negative_documents[:] + positive_documents[:]


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
vectorizer.fit(X)

In [8]:
X = vectorizer.transform(X)

In [9]:
X

<2000x571495 sparse matrix of type '<class 'numpy.float64'>'
	with 1212450 stored elements in Compressed Sparse Row format>

In [10]:
import numpy as np

In [11]:
y = np.concatenate((np.zeros(1000),np.ones(1000)) , axis=0)

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

# SVC

In [14]:
from sklearn import svm

In [15]:
svmc = svm.SVC(verbose=True, C=1 , kernel='linear' , cache_size=1000 , random_state=333)
svmc.fit(X_train, y_train)
svmc.score(X_train, y_train)

[LibSVM]

1.0

In [16]:
svmc.score(X_test, y_test)

0.876

In [48]:
y_pred_svm = svmc.predict(X_test)

# DT

In [19]:
from sklearn import tree

In [20]:
dt = tree.DecisionTreeClassifier( criterion='entropy', splitter='best', max_depth=None , min_samples_split=10, min_samples_leaf=5, random_state=111, ccp_alpha=0.0)
dt.fit(X_train, y_train)
dt.score(X_train, y_train)

0.9446666666666667

In [21]:
dt.score(X_test, y_test)

0.618

In [49]:
y_pred_dt = dt.predict(X_test)

# Random Forest

In [23]:
from sklearn.ensemble import RandomForestClassifier

In [24]:
rf = RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=None, min_samples_split=100, min_samples_leaf=10, max_features='sqrt', max_leaf_nodes=None , bootstrap=True, n_jobs=-1, random_state=220, verbose=1)
rf.fit(X_train, y_train)
rf.score(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    1.3s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.6s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    1.5s finished


0.9353333333333333

In [25]:
rf.score(X_test, y_test)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.5s finished


0.804

In [50]:
y_pred_rf = rf.predict(X_test)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.4s finished


# CatBoost Classifier

In [27]:
from catboost import CatBoostClassifier

In [46]:
cat = CatBoostClassifier(iterations=61,
                         learning_rate=0.1,
                         depth=None,
                         loss_function=None,
                         use_best_model=None,
                         verbose=None,
                         eval_metric='Accuracy',
                         boosting_type=None,
                         bootstrap_type=None,
                         max_depth=None,
                         n_estimators=None,
                         num_trees=None,
                         random_state=1111,
                         reg_lambda=None,
                         objective=None,
                         cat_features=None,
                         grow_policy=None,
                         min_data_in_leaf=10,
                         min_child_samples=None,
                         score_function=None,
                         task_type=None
                         )

In [47]:
cat.fit(X_train , y_train , eval_set=(X_test,y_test) , plot=True )

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6653333	test: 0.6480000	best: 0.6480000 (0)	total: 1.51s	remaining: 1m 30s
1:	learn: 0.6700000	test: 0.6420000	best: 0.6480000 (0)	total: 2.92s	remaining: 1m 26s
2:	learn: 0.6780000	test: 0.6600000	best: 0.6600000 (2)	total: 4.23s	remaining: 1m 21s
3:	learn: 0.6913333	test: 0.6760000	best: 0.6760000 (3)	total: 5.53s	remaining: 1m 18s
4:	learn: 0.7066667	test: 0.6960000	best: 0.6960000 (4)	total: 6.82s	remaining: 1m 16s
5:	learn: 0.7260000	test: 0.7000000	best: 0.7000000 (5)	total: 8.1s	remaining: 1m 14s
6:	learn: 0.7266667	test: 0.7000000	best: 0.7000000 (5)	total: 9.4s	remaining: 1m 12s
7:	learn: 0.7433333	test: 0.7160000	best: 0.7160000 (7)	total: 10.8s	remaining: 1m 11s
8:	learn: 0.7586667	test: 0.7140000	best: 0.7160000 (7)	total: 12.1s	remaining: 1m 9s
9:	learn: 0.7600000	test: 0.7180000	best: 0.7180000 (9)	total: 13.4s	remaining: 1m 8s
10:	learn: 0.7573333	test: 0.7220000	best: 0.7220000 (10)	total: 14.7s	remaining: 1m 6s
11:	learn: 0.7653333	test: 0.7280000	best: 0.7

<catboost.core.CatBoostClassifier at 0x12f76c80b50>

In [51]:
y_pred_cat = cat.predict(X_test)

# Classification Report for Models

In [59]:
from sklearn.metrics import classification_report, confusion_matrix
print('SVM:',classification_report(y_test, y_pred_svm))
print()
print('CAT:',classification_report(y_test, y_pred_cat)) 
print()
print('RF:' , classification_report(y_test, y_pred_rf))
print()
print('DT:' , classification_report(y_test, y_pred_dt))
     

SVM:               precision    recall  f1-score   support

         0.0       0.89      0.86      0.88       253
         1.0       0.86      0.89      0.88       247

    accuracy                           0.88       500
   macro avg       0.88      0.88      0.88       500
weighted avg       0.88      0.88      0.88       500


CAT:               precision    recall  f1-score   support

         0.0       0.88      0.75      0.81       253
         1.0       0.78      0.90      0.83       247

    accuracy                           0.82       500
   macro avg       0.83      0.82      0.82       500
weighted avg       0.83      0.82      0.82       500


RF:               precision    recall  f1-score   support

         0.0       0.80      0.81      0.81       253
         1.0       0.80      0.80      0.80       247

    accuracy                           0.80       500
   macro avg       0.80      0.80      0.80       500
weighted avg       0.80      0.80      0.80       500


DT