In [15]:
from utils import read_process_data
from sklearn.feature_extraction.text import TfidfVectorizer

import pandas as pd

## Data loading

In [2]:
data_path = 'data'
train_csv = data_path + '/train.csv'
test_csv  = data_path + '/test.csv'

contractions_csv = data_path + '/contractions.csv'

dict_sent_to_label = {
    'negative': 0,
    'neutral': 1,
    'positive': 2
}

In [3]:
df_train = read_process_data(train_csv, True, False, contractions_csv)
df_test  = read_process_data(test_csv , True, False, contractions_csv)

In [4]:
print(df_train["text"].isna().sum(), df_train["sentiment"].isna().sum(), len(df_train))

if df_train["text"].isna().sum() > 0 or df_train["sentiment"].isna().sum() > 0:
    df_train.dropna(inplace=True)

df_train["label"] = df_train["sentiment"].map(dict_sent_to_label)
df_train["label"] = df_train["label"].astype('int')

0 0 27481


In [5]:
print(df_test["text"].isna().sum(), df_test["sentiment"].isna().sum(), len(df_test))

if df_test["text"].isna().sum() > 0 or df_test["sentiment"].isna().sum() > 0:
    df_test.dropna(inplace=True)

df_test["label"] = df_test["sentiment"].map(dict_sent_to_label)
df_test["label"] = df_test["label"].astype('int')

0 1281 4815


## Tf-Idf

In [16]:
def get_features(df_train, df_test, text_col_name, label_col_name, max_ngram):
    tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, ngram_range=(1, max_ngram))

    features_train = tfidf.fit_transform(df_train[text_col_name]).toarray()
    labels_train = df_train[label_col_name]

    features_test = tfidf.transform(df_test[text_col_name]).toarray()
    labels_test = df_test[label_col_name]

    return features_train, labels_train, features_test, labels_test

In [17]:
features_train, labels_train, features_test, labels_test = get_features(df_train, df_test, "text_preprocessed", "label", 3)

## Explore multiclassification models

In [14]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

In [9]:
models = [
    RandomForestClassifier(n_estimators=100, max_depth=5, random_state=2018),
    SVC(random_state=2018),
    MultinomialNB(),
    LogisticRegression(random_state=2018),
]

CV = 5
cv_df = pd.DataFrame(index=range(CV * len(models)))

In [10]:
entries = []
for model in models:
  model_name = model.__class__.__name__
  accuracies = cross_val_score(model, features_train, labels_train, scoring='accuracy', cv=CV)
  for fold_idx, accuracy in enumerate(accuracies):
    entries.append((model_name, fold_idx, accuracy))
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [11]:
mean_accuracy = cv_df.groupby('model_name').accuracy.mean()
std_accuracy = cv_df.groupby('model_name').accuracy.std()

acc = pd.concat([mean_accuracy, std_accuracy], axis=1, ignore_index=True)
acc.columns = ['Mean Accuracy', 'Standard deviation']
acc

Unnamed: 0_level_0,Mean Accuracy,Standard deviation
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1
LogisticRegression,0.691896,0.006208
MultinomialNB,0.644154,0.004827
RandomForestClassifier,0.433354,0.004468
SVC,0.693134,0.005827
