In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
import time
import warnings
warnings.filterwarnings('ignore')

In [2]:
data=pd.read_csv("preprocessed_file.csv")
data.head()

Unnamed: 0,Time,clean text,Score
0,939340800,witty little book makes son laugh loud recite ...,1
1,1194739200,grew reading sendak books watching really rosi...,1
2,1191456000,fun way children learn months year learn poems...,1
3,1076025600,great little book read aloud nice rhythm well ...,1
4,1018396800,book poetry months year goes month cute little...,1


In [3]:
data.isnull().any()

Time          False
clean text     True
Score         False
dtype: bool

In [4]:
final_data=data.dropna()
final_data.isnull().any()

Time          False
clean text    False
Score         False
dtype: bool

In [5]:
final_data["Score"].value_counts()

1    306201
0     56991
Name: Score, dtype: int64

In [6]:
df=final_data.sample(100000,random_state=42)

In [9]:
x=df["clean text"]
y=df["Score"]
x_train,x_test,y_train,y_test=train_test_split(x,y,stratify=y,test_size=0.3)

In [10]:
print(x_train.shape,y_train.shape)
print(x_test.shape,y_test.shape)

(70000,) (70000,)
(30000,) (30000,)


In [11]:
bow = CountVectorizer()
# Call the fit_transform method on training data
x_train_bow = bow.fit_transform(x_train.values)
x_train_bow.shape

(70000, 51399)

In [12]:
x_test_bow = bow.transform(x_test.values)
x_test_bow.shape

(30000, 51399)

In [13]:
std = StandardScaler(with_mean=False)
x_train_stand = std.fit_transform(x_train_bow)
x_test_stand= std.transform(x_test_bow)

In [14]:
start = time.time()
# creating list of C
C_values = np.linspace(0.1,1,10)

cv_scores = [] # empty list that will hold cv scores

# Try each value of alpha in the below loop
for c in C_values:
    # Create an object of the class Logistic Regression with balanced class weights
    clf = LogisticRegression(C = c, class_weight = 'balanced',max_iter=5,solver='saga')
    # perform 5-fold cross validation
    # It returns the cv accuracy for each fold in a list
    scores = cross_val_score(clf,x_train_stand, y_train, cv=5, scoring='accuracy')
    # Store the mean of the accuracies from all the 5 folds
    cv_scores.append(scores.mean())

# calculate misclassification error from accuracy (error = 1 - accuracy)
cv_error = [1 - x for x in cv_scores]

# optimal (best) C is the one for which error is minimum (or accuracy is maximum)
optimal_C = C_values[cv_error.index(min(cv_error))]
print('\nThe optimal alpha is', optimal_C)

end = time.time()
print("Total time in minutes = ", (end-start)/60)


The optimal alpha is 0.30000000000000004
Total time in minutes =  0.3672641674677531


In [15]:
knn_optimal =LogisticRegression(C=optimal_C)
# fitting the model
knn_optimal.fit(x_train_stand,y_train)
# predict the response
pred = knn_optimal.predict(x_test_stand)

# evaluate accuracy
acc = accuracy_score(y_test, pred) * 100
print('\nThe accuracy of the classifier for k = %d is %f%%' % (optimal_C, acc))


The accuracy of the classifier for k = 0 is 87.803333%


In [16]:
confusion_matrix(y_test,pred)

array([[ 2976,  1730],
       [ 1929, 23365]], dtype=int64)