In [3]:
%run -i "data_processing.py"

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mapal\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


['ad', 'sal', 'boo', 'tim', 'war', 'profi', 'quarterly', 'profi', 'us', 'media', 'gia', 'timewar', 'jumped', '76', '1', '13b', 'â', '600m', 'three', 'month', 'decemb', '639m', 'year', 'earlier', 'firm', 'one', 'bigg', 'investor', 'googl', 'benefited', 'sal', 'high', 'speed', 'inter', 'connectio', 'high', 'adver', 'sal', 'timewar', 'said', 'fourth', 'quar', 'sal', 'ros', '2', '11', '1bn', '10', '9bn', 'profi', 'buoyed', 'one', 'gai', 'off', 'profi', 'dip', 'war', 'bro', 'less', 'user', 'aol', 'tim', 'war', 'said', 'friday', 'own', '8', 'search', 'engi', 'googl', 'inter', 'business', 'aol', 'mixed', 'fortu', 'los', '464', '000', 'subscrib', 'fourth', 'quar', 'profi', 'lower', 'preceding', 'three', 'quar', 'howev', 'company', 'said', 'aol', 'underlying', 'profi', 'exceptional', 'item', 'ros', '8', 'back', 'strong', 'inter', 'advertising', 'revenu', 'hop', 'increa', 'subscrib', 'offering', 'onli', 'servic', 'free', 'timewar', 'inter', 'custom', 'try', 'sig', 'aol', 'existing', 'custom', 'h

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, cohen_kappa_score, confusion_matrix

In [5]:
x = np.load('../data/features/tokenized.npy')
y = np.load('../data/features/labels.npy')

## TF-IDF

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

def dummy_fun(doc):
    return doc

vect = TfidfVectorizer(
    analyzer='word',
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None, 
    min_df=2,
    max_df=0.5,
    ngram_range=(1,1))

X = vect.fit_transform(x)
Y = y

print ("no of features extracted:", X.shape[1])

no of features extracted: 14850


## Count Vectorizer

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

def dummy_fun(doc):
    return doc

countvect = CountVectorizer(
    analyzer='word',
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None, 
    min_df=2,
    max_df=0.5,
    ngram_range=(1,1))

X = vect.fit_transform(x)
Y = y

print ("no of features extracted:", X.shape[1])

no of features extracted: 14850


In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42, stratify=Y)

print ("train size:", X_train.shape)
print ("test size:", X_test.shape)
print ("class distribution in training set:", pd.Series(y_train).value_counts())
print ("class distribution in test set:", pd.Series(y_test).value_counts())

train size: (1557, 14850)
test size: (668, 14850)
class distribution in training set: 3    357
0    357
2    292
4    281
1    270
dtype: int64
class distribution in test set: 3    154
0    153
2    125
4    120
1    116
dtype: int64


In [24]:
# splitting into training and validation set
X_training, X_val, y_training, y_val = train_test_split(X_train, y_train, test_size=0.3, random_state=42, stratify=y_train)

## Logistic Regression

In [18]:
from sklearn.linear_model import LogisticRegression

In [25]:
for c in [0.01, 0.05, 0.25, 0.5, 1]:    
    lr = LogisticRegression(C=c)
    lr.fit(X_training, y_training)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, lr.predict(X_val))))



Accuracy for C=0.01: 0.47863247863247865
Accuracy for C=0.05: 0.7564102564102564
Accuracy for C=0.25: 0.9529914529914529
Accuracy for C=0.5: 0.9722222222222222
Accuracy for C=1: 0.9786324786324786


In [26]:
model = LogisticRegression(C=1)
model.fit(X_training, y_training)
print ("Final Accuracy: %s" 
       % accuracy_score(y_test, model.predict(X_test)))

Final Accuracy: 0.9745508982035929


In [27]:
c_mat = confusion_matrix(y_test,model.predict(X_test))
print ("Confusion Matrix:\n", c_mat)

Confusion Matrix:
 [[149   0   2   1   1]
 [  0 114   2   0   0]
 [  6   0 117   1   1]
 [  0   0   0 154   0]
 [  2   0   0   1 117]]


## SVM

In [22]:
from sklearn.svm import LinearSVC

In [28]:
for c in [0.001, 0.005, 0.01, 0.05, 0.1]:    
    svm = LinearSVC(C=c)
    svm.fit(X_training, y_training)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, svm.predict(X_val))))

Accuracy for C=0.001: 0.47649572649572647
Accuracy for C=0.005: 0.8012820512820513
Accuracy for C=0.01: 0.9081196581196581
Accuracy for C=0.05: 0.9722222222222222
Accuracy for C=0.1: 0.9786324786324786


In [29]:
model = LinearSVC(C=1)
model.fit(X_training, y_training)
print ("Final Accuracy: %s" 
       % accuracy_score(y_test, model.predict(X_test)))

Final Accuracy: 0.9790419161676647


In [30]:
c_mat = confusion_matrix(y_test,model.predict(X_test))
print ("Confusion Matrix:\n", c_mat)

Confusion Matrix:
 [[147   1   3   1   1]
 [  0 114   2   0   0]
 [  3   0 121   0   1]
 [  0   0   0 154   0]
 [  1   1   0   0 118]]


## Random Forest

In [31]:
from sklearn.ensemble import RandomForestClassifier

In [32]:
model = RandomForestClassifier(n_estimators=300, max_depth=150,n_jobs=1)
model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=150, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [34]:
y_pred = model.predict(X_test)
c_mat = confusion_matrix(y_test,y_pred)
acc = accuracy_score(y_test,y_pred)

print ("Confusion Matrix:\n", c_mat)
print ("\nAccuracy: ",acc)

Confusion Matrix:
 [[150   0   2   0   1]
 [  2 113   1   0   0]
 [  9   0 115   0   1]
 [  0   0   0 154   0]
 [  3   1   1   0 115]]

Accuracy:  0.968562874251497
