### 0. Imports

In [1]:
import numpy as np
import pandas as pd
import pickle as pkl

### 1. Read Feature Dictionary and Train Set

In [2]:
# Load the pickled feature dictionary. vectorize() looks words up in it and
# reads the optional "pos_freq" / "neg_freq" entries of each match.
# NOTE(review): pickle.load can execute arbitrary code — only load this file
# if it comes from a trusted source.
with open("data/feature_dict.pkl","rb") as in_file:
    feature_dict = pkl.load(in_file)

In [3]:
# Number of entries in the feature dictionary.
len(feature_dict)

44666

In [4]:
# Read the preprocessed training set. fillna("") replaces missing values in
# *every* column with an empty string — presumably so that rows with no text
# can still be split by vectorize(); TODO confirm no other column has NaNs.
train_df = pd.read_csv("data/preprocessed_train_set.csv").fillna("")

In [5]:
# Number of rows in the training set.
len(train_df)

79998

### 2. Vectorize and get X (Feature Vectors) and Y (labels)

In [6]:
def vectorize(text, features=None):
    """Turn a piece of text into a 3-element feature vector.

    The text is split on whitespace and every token is looked up in the
    feature dictionary. The ``"pos_freq"`` and ``"neg_freq"`` values of the
    matching entries are summed; tokens that are not in the dictionary, and
    entries that lack one of the two keys, contribute 0. A token that occurs
    several times is counted once per occurrence.

    Args:
        text: String to vectorize.
        features: Optional mapping of token -> dict holding ``"pos_freq"``
            and/or ``"neg_freq"``. When omitted, the notebook-level
            ``feature_dict`` is used, which keeps existing ``vectorize(text)``
            calls working while letting callers supply their own lexicon.

    Returns:
        ``[1, pos_val, neg_val]``: a constant 1 followed by the summed
        positive and negative frequencies.
    """
    # Compare against None (not truthiness) so an explicitly passed empty
    # mapping is honoured instead of silently falling back to the global.
    if features is None:
        features = feature_dict
    pos_val, neg_val = 0, 0
    for word in text.split():
        entry = features.get(word, {})
        pos_val += entry.get("pos_freq", 0)
        neg_val += entry.get("neg_freq", 0)
    return [1, pos_val, neg_val]

In [7]:
# Build the feature matrix: one [1, pos_sum, neg_sum] row per training text.
X = np.array([vectorize(sentence) for sentence in train_df["SentimentText"]])

In [8]:
# Sanity check: one row per training example, three columns per row
# (constant 1, summed pos_freq, summed neg_freq) as returned by vectorize().
X.shape

(79998, 3)

In [9]:
# Label vector taken from the "Sentiment" column, aligned row-for-row with X.
Y = np.array(train_df["Sentiment"].tolist())

In [10]:
# Sanity check: one label per row of X.
Y.shape

(79998,)

### 3. Split into Train and Validation Sets

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
# Hold out 30% of the rows for evaluation; random_state is fixed so the split
# is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

In [13]:
# Shapes of the four arrays produced by the split (features/labels x train/held-out).
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((55998, 3), (24000, 3), (55998,), (24000,))

### 4. Logistic Regression Training

In [14]:
from sklearn.linear_model import LogisticRegression
# Logistic regression classifier created with scikit-learn's default settings
# (no arguments are passed).
logreg = LogisticRegression()

In [15]:
# Fit the classifier on the training split; the fitted estimator is echoed as
# the cell output.
logreg.fit(X_train, y_train)

LogisticRegression()

### 5. Validation set Metrics

In [16]:
# Predictions on the held-out split are kept in y_pred for the metric cells below.
y_pred = logreg.predict(X_test)
# score() returns the mean accuracy of the classifier on the given data.
held_out_accuracy = logreg.score(X_test, y_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(held_out_accuracy))

Accuracy of logistic regression classifier on test set: 0.70


In [17]:
from sklearn.metrics import confusion_matrix
# Store the result under its own name. Assigning it back to `confusion_matrix`
# would rebind the imported function to an ndarray, so running this cell a
# second time would fail with "'numpy.ndarray' object is not callable".
# Rows are the true labels, columns the predicted labels.
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

[[ 6557  3918]
 [ 3259 10266]]


In [18]:
from sklearn.metrics import classification_report
# Per-class precision, recall, F1 and support on the held-out split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.67      0.63      0.65     10475
           1       0.72      0.76      0.74     13525

    accuracy                           0.70     24000
   macro avg       0.70      0.69      0.69     24000
weighted avg       0.70      0.70      0.70     24000



### 6. Save model

In [19]:
import pickle as pkl

In [20]:
# Persist the fitted classifier with pickle so it can be reloaded later.
# NOTE(review): the pickle should only be loaded from a trusted location, and
# presumably with a compatible scikit-learn version — confirm before reuse.
with open("data/logistic_regressor.pkl", "wb") as model_output:
    pkl.dump(logreg, model_output)