In [2]:
## Python built-in modules:
import os
import re
from collections import Counter
import time
import sys

## EDA libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

## Metrics (sklearn)
from sklearn.metrics import confusion_matrix, classification_report, make_scorer
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score

## Models (sklearn)
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [3]:
## Importing the data
# Every file under <path>/neg and <path>/pos is read as one document; the
# sub-directory name decides the label (pos -> 1, neg -> 0).

path = '../data/train/'

count = 0
labels = []
contents = []

for label in ['neg','pos']:
    # os.listdir returns entries in arbitrary order, so sort them: otherwise the
    # seeded shuffle below yields a different row order on another machine.
    filenames = sorted(os.listdir(os.path.join(path, label)))
    for filename in filenames:
        count += 1
        # Explicit encoding so decoding does not depend on the platform locale.
        # NOTE(review): assumes the files are UTF-8 -- confirm against the dataset.
        with open(os.path.join(path, label, filename), 'r', encoding='utf-8') as f:
            contents.append(f.read())
        labels.append(1 if label == 'pos' else 0) # 1 is positive, 0 is negative
print(count)

data = pd.DataFrame({
    'contents' : contents,
    'labels': labels,
})

# Shuffle the rows with a fixed seed so the neg/pos blocks are interleaved and
# the slices taken from the top of the frame contain both classes.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)
X = data.contents
y = data.labels

25000


In [5]:
# Keep only the first quarter of the shuffled rows for the experiments below.
X_sm = X.iloc[:len(X) // 4]
y_sm = y.iloc[:len(y) // 4]

In [5]:
# Experiment: unigram TF-IDF (min_df=2) -> scaling without centering -> SVC(gamma='auto').
pipeline = Pipeline(
    steps=[
        ("Tfidf", TfidfVectorizer(ngram_range=(1, 1), min_df=2)),
        ("scaler", StandardScaler(with_mean=False)),
        ("clf", SVC(gamma="auto")),
    ]
)

# Cross-validated F1 on the current subsample (cv=5).
cv = 5
scorer = make_scorer(f1_score)
result = cross_val_score(
    estimator=pipeline,
    X=X_sm,
    y=y_sm,
    scoring=scorer,
    cv=cv,
    verbose=2,
)

# Mean and standard deviation of the fold scores, in percent.
f1_mean, f1_std = result.mean(), result.std()
print(round(f1_mean * 100, 2))
print(round(f1_std * 100, 3))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time=  24.6s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   24.7s remaining:    0.0s


[CV] END .................................................... total time=  24.8s
[CV] END .................................................... total time=  24.8s
[CV] END .................................................... total time=  24.9s
[CV] END .................................................... total time=  24.8s
84.72
0.744


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.1min finished


In [10]:
# Experiment: unigram TF-IDF (min_df=2) -> scaling without centering -> linear-kernel SVC.
pipeline = Pipeline(
    [
        ("Tfidf"  , TfidfVectorizer(ngram_range=(1,1), min_df=2)),
        ("scaler" , StandardScaler(with_mean=False)),
        # gamma is a coefficient of the rbf/poly/sigmoid kernels only; it has no
        # effect with kernel="linear", so it is not passed here.
        ("clf"    , SVC(kernel="linear", verbose=True, cache_size=400)),
    ]
)

# Cross-validated F1 on the current subsample (cv=5).
cv = 5
scorer = make_scorer(f1_score)
result = cross_val_score(pipeline, X_sm, y_sm, cv=cv, scoring=scorer, verbose=2)

# Mean and standard deviation of the fold scores, in percent.
print(round(result.mean() * 100 , 2))
print(round(result.std()  * 100 , 3))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[LibSVM][CV] END .................................................... total time=  23.5s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   23.5s remaining:    0.0s


[LibSVM][CV] END .................................................... total time=  23.7s
[LibSVM][CV] END .................................................... total time=  23.8s
[LibSVM][CV] END .................................................... total time=  23.6s
[LibSVM][CV] END .................................................... total time=  23.6s
83.19
0.596


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.0min finished


In [11]:
# Experiment: unigram TF-IDF (min_df=2) -> scaling without centering ->
# SVC(gamma='scale') with a 400 MB kernel cache.
pipeline = Pipeline(
    steps=[
        ("Tfidf", TfidfVectorizer(ngram_range=(1, 1), min_df=2)),
        ("scaler", StandardScaler(with_mean=False)),
        ("clf", SVC(gamma="scale", verbose=True, cache_size=400)),
    ]
)

# Cross-validated F1 on the current subsample (cv=5).
cv = 5
scorer = make_scorer(f1_score)
result = cross_val_score(
    estimator=pipeline,
    X=X_sm,
    y=y_sm,
    scoring=scorer,
    cv=cv,
    verbose=2,
)

# Mean and standard deviation of the fold scores, in percent.
f1_mean, f1_std = result.mean(), result.std()
print(round(f1_mean * 100, 2))
print(round(f1_std * 100, 3))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[LibSVM][CV] END .................................................... total time=  25.0s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   25.0s remaining:    0.0s


[LibSVM][CV] END .................................................... total time=  24.9s
[LibSVM][CV] END .................................................... total time=  25.5s
[LibSVM][CV] END .................................................... total time=  25.2s
[LibSVM][CV] END .................................................... total time=  24.6s
84.72
0.731


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.1min finished


In [12]:
# Experiment: 1- to 3-gram TF-IDF (min_df=20, max_df=0.5) -> scaling without
# centering -> SVC(gamma='auto').
pipeline = Pipeline(
    steps=[
        ("Tfidf", TfidfVectorizer(ngram_range=(1, 3), min_df=20, max_df=.5)),
        ("scaler", StandardScaler(with_mean=False)),
        ("clf", SVC(gamma="auto", verbose=True)),
    ]
)

# Cross-validated F1 on the current subsample (cv=5).
cv = 5
scorer = make_scorer(f1_score)
result = cross_val_score(
    estimator=pipeline,
    X=X_sm,
    y=y_sm,
    scoring=scorer,
    cv=cv,
    verbose=2,
)

# Mean and standard deviation of the fold scores, in percent.
f1_mean, f1_std = result.mean(), result.std()
print(round(f1_mean * 100, 2))
print(round(f1_std * 100, 3))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[LibSVM][CV] END .................................................... total time=  32.6s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   32.6s remaining:    0.0s


[LibSVM][CV] END .................................................... total time=  32.4s
[LibSVM][CV] END .................................................... total time=  32.4s
[LibSVM][CV] END .................................................... total time=  32.2s
[LibSVM][CV] END .................................................... total time=  32.1s
87.39
0.894


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.7min finished


In [16]:
# Switch the working subset to the first 8000 shuffled rows.
X_sm, y_sm = X.iloc[:8000], y.iloc[:8000]

In [19]:
# Experiment: 1- to 3-gram TF-IDF (min_df=20, max_df=0.5) -> scaling without
# centering -> SVC(gamma='auto'), evaluated on whatever X_sm / y_sm currently hold.
pipeline = Pipeline(
    steps=[
        ("Tfidf", TfidfVectorizer(ngram_range=(1, 3), min_df=20, max_df=.5)),
        ("scaler", StandardScaler(with_mean=False)),
        ("clf", SVC(gamma="auto", verbose=True)),
    ]
)

# Cross-validated F1 (cv=5).
cv = 5
scorer = make_scorer(f1_score)
result = cross_val_score(
    estimator=pipeline,
    X=X_sm,
    y=y_sm,
    scoring=scorer,
    cv=cv,
    verbose=2,
)

# Mean and standard deviation of the fold scores, in percent.
f1_mean, f1_std = result.mean(), result.std()
print(round(f1_mean * 100, 2))
print(round(f1_std * 100, 3))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[LibSVM][CV] END .................................................... total time=  57.3s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   57.3s remaining:    0.0s


[LibSVM][CV] END .................................................... total time=  57.8s
[LibSVM][CV] END .................................................... total time=  55.4s
[LibSVM][CV] END .................................................... total time=  55.0s
[LibSVM][CV] END .................................................... total time=  55.1s
87.69
0.603


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  4.7min finished


In [21]:
# Experiment: 1- to 3-gram TF-IDF (min_df=20, max_df=0.7) -> scaling without
# centering -> SVC(gamma='scale').
pipeline = Pipeline(
    steps=[
        ("Tfidf", TfidfVectorizer(ngram_range=(1, 3), min_df=20, max_df=.7)),
        ("scaler", StandardScaler(with_mean=False)),
        ("clf", SVC(gamma="scale", verbose=True)),
    ]
)

# Cross-validated F1 on the current subsample (cv=5).
cv = 5
scorer = make_scorer(f1_score)
result = cross_val_score(
    estimator=pipeline,
    X=X_sm,
    y=y_sm,
    scoring=scorer,
    cv=cv,
    verbose=2,
)

# Mean and standard deviation of the fold scores, in percent.
f1_mean, f1_std = result.mean(), result.std()
print(round(f1_mean * 100, 2))
print(round(f1_std * 100, 3))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[LibSVM][CV] END .................................................... total time=  57.8s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   57.8s remaining:    0.0s


[LibSVM][CV] END .................................................... total time=  57.4s
[LibSVM][CV] END .................................................... total time=  57.1s
[LibSVM][CV] END .................................................... total time=  58.3s
[LibSVM][CV] END .................................................... total time=  58.1s
87.66
0.654


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  4.8min finished


In [22]:
# Experiment: 1- to 3-gram TF-IDF (min_df=15, max_df=0.5) -> scaling without
# centering -> SVC(gamma='auto').
pipeline = Pipeline(
    steps=[
        ("Tfidf", TfidfVectorizer(ngram_range=(1, 3), min_df=15, max_df=.5)),
        ("scaler", StandardScaler(with_mean=False)),
        ("clf", SVC(gamma="auto", verbose=True)),
    ]
)

# Cross-validated F1 on the current subsample (cv=5).
cv = 5
scorer = make_scorer(f1_score)
result = cross_val_score(
    estimator=pipeline,
    X=X_sm,
    y=y_sm,
    scoring=scorer,
    cv=cv,
    verbose=2,
)

# Mean and standard deviation of the fold scores, in percent.
f1_mean, f1_std = result.mean(), result.std()
print(round(f1_mean * 100, 2))
print(round(f1_std * 100, 3))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[LibSVM][CV] END .................................................... total time=  58.1s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   58.1s remaining:    0.0s


[LibSVM][CV] END .................................................... total time= 1.0min
[LibSVM][CV] END .................................................... total time=  58.3s
[LibSVM][CV] END .................................................... total time=  57.4s
[LibSVM][CV] END .................................................... total time=  58.0s
88.16
0.373


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  4.9min finished


In [25]:
import nltk
from nltk.corpus import stopwords

# The stop-word corpus is a separate NLTK data package. On a machine where it
# has not been installed, stopwords.words() raises LookupError, so fetch it once
# and retry instead of failing the whole notebook run.
try:
    english_sw = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    english_sw = set(stopwords.words('english'))

In [27]:
# Experiment: 1- to 3-gram TF-IDF (min_df=15) with the NLTK English stop words
# removed -> scaling without centering -> SVC(gamma='auto').
pipeline = Pipeline(
    [
        # stop_words is documented as 'english', a list, or None. scikit-learn
        # releases that validate constructor parameters reject a set, so hand
        # over a (sorted, hence deterministic) list built from english_sw.
        ("Tfidf"  , TfidfVectorizer(ngram_range=(1,3), min_df=15, stop_words=sorted(english_sw))),
        ("scaler" , StandardScaler(with_mean=False)),
        ("clf"    , SVC(gamma='auto',verbose=True)),
    ]
)

# Cross-validated F1 on the current subsample (cv=5).
cv = 5
scorer = make_scorer(f1_score)
result = cross_val_score(pipeline, X_sm, y_sm, cv=cv, scoring=scorer, verbose=2)

# Mean and standard deviation of the fold scores, in percent.
print(round(result.mean() * 100 , 2))
print(round(result.std()  * 100 , 3))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[LibSVM][CV] END .................................................... total time=  30.7s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   30.7s remaining:    0.0s


[LibSVM][CV] END .................................................... total time=  29.7s
[LibSVM][CV] END .................................................... total time=  29.5s
[LibSVM][CV] END .................................................... total time=  28.8s
[LibSVM][CV] END .................................................... total time=  28.8s
86.72
0.574


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.5min finished


In [29]:
# Experiment: 1- to 3-gram TF-IDF (min_df=12, max_df=0.5) -> scaling without
# centering -> SVC(gamma='auto').
pipeline = Pipeline(
    steps=[
        ("Tfidf", TfidfVectorizer(ngram_range=(1, 3), min_df=12, max_df=.5)),
        ("scaler", StandardScaler(with_mean=False)),
        ("clf", SVC(gamma="auto", verbose=True)),
    ]
)

# Cross-validated F1 on the current subsample (cv=5).
cv = 5
scorer = make_scorer(f1_score)
result = cross_val_score(
    estimator=pipeline,
    X=X_sm,
    y=y_sm,
    scoring=scorer,
    cv=cv,
    verbose=2,
)

# Mean and standard deviation of the fold scores, in percent.
f1_mean, f1_std = result.mean(), result.std()
print(round(f1_mean * 100, 2))
print(round(f1_std * 100, 3))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[LibSVM][CV] END .................................................... total time= 1.0min


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.0min remaining:    0.0s


[LibSVM][CV] END .................................................... total time= 1.0min
[LibSVM][CV] END .................................................... total time= 1.0min
[LibSVM][CV] END .................................................... total time=  59.6s
[LibSVM][CV] END .................................................... total time= 1.0min
88.17
0.395


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  5.1min finished


In [30]:
# Switch the working subset to the first 10000 shuffled rows.
X_sm, y_sm = X.iloc[:10000], y.iloc[:10000]

In [31]:
# Experiment: 1- to 3-gram TF-IDF (min_df=15, max_df=0.5) -> scaling without
# centering -> SVC(gamma='auto'), evaluated on whatever X_sm / y_sm currently hold.
pipeline = Pipeline(
    steps=[
        ("Tfidf", TfidfVectorizer(ngram_range=(1, 3), min_df=15, max_df=.5)),
        ("scaler", StandardScaler(with_mean=False)),
        ("clf", SVC(gamma="auto", verbose=True)),
    ]
)

# Cross-validated F1 (cv=5).
cv = 5
scorer = make_scorer(f1_score)
result = cross_val_score(
    estimator=pipeline,
    X=X_sm,
    y=y_sm,
    scoring=scorer,
    cv=cv,
    verbose=2,
)

# Mean and standard deviation of the fold scores, in percent.
f1_mean, f1_std = result.mean(), result.std()
print(round(f1_mean * 100, 2))
print(round(f1_std * 100, 3))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[LibSVM][CV] END .................................................... total time= 1.9min


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.9min remaining:    0.0s


[LibSVM][CV] END .................................................... total time= 1.9min
[LibSVM][CV] END .................................................... total time= 2.0min
[LibSVM][CV] END .................................................... total time= 1.9min
[LibSVM][CV] END .................................................... total time= 1.9min
88.6
0.548


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  9.6min finished
