In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,TfidfTransformer
import warnings 
warnings.filterwarnings('ignore')
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import time
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [2]:
data=pd.read_csv("preprocessed_file.csv")
data.head(2)

Unnamed: 0,Time,clean text,Score
0,939340800,witty little book makes son laugh loud recite ...,1
1,1194739200,grew reading sendak books watching really rosi...,1


In [20]:
data.isnull().any().sum()

1

In [4]:
final_data=data.dropna()

In [5]:
final_data.isnull().any().sum()

0

In [6]:
sample_data=final_data.sample(100000,random_state=42)
print(sample_data.shape)
print(sample_data.columns)

(100000, 3)
Index(['Time', 'clean text', 'Score'], dtype='object')


In [7]:
X=sample_data['clean text']
Y=sample_data["Score"]
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,stratify=Y,test_size=0.3,random_state=42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(70000,) (70000,)
(30000,) (30000,)


In [8]:
tfidf=TfidfVectorizer()
x_train_tfidf=tfidf.fit_transform(X_train)
print(type(x_train_tfidf))
print(x_train_tfidf.shape)

<class 'scipy.sparse._csr.csr_matrix'>
(70000, 51227)


In [9]:
x_test_tfidf=tfidf.transform(X_test)
print(type(x_test_tfidf))
print(x_test_tfidf.shape)


<class 'scipy.sparse._csr.csr_matrix'>
(30000, 51227)


In [10]:
s=StandardScaler(with_mean=False)
X_train_stand=s.fit_transform(x_train_tfidf)
X_test_stand=s.transform(x_test_tfidf)

In [16]:
start = time.time()
# creating list of C
C_values = np.linspace(0.1,1,10)

cv_scores = [] # empty list that will hold cv scores

# Try each value of alpha in the below loop
for c in C_values:
    # Create an object of the class Logistic Regression with balanced class weights
    clf = LogisticRegression(C = c, class_weight = 'balanced',max_iter=5,solver='saga')
    # perform 5-fold cross validation
    # It returns the cv accuracy for each fold in a list
    scores = cross_val_score(clf,X_train_stand, Y_train, cv=5, scoring='accuracy')
    # Store the mean of the accuracies from all the 5 folds
    cv_scores.append(scores.mean())

# calculate misclassification error from accuracy (error = 1 - accuracy)
cv_error = [1 - x for x in cv_scores]

# optimal (best) C is the one for which error is minimum (or accuracy is maximum)
optimal_C = C_values[cv_error.index(min(cv_error))]
print('\nThe optimal alpha is', optimal_C)

end = time.time()
print("Total time in minutes = ", (end-start)/60)


The optimal alpha is 1.0
Total time in minutes =  0.5416214823722839


In [19]:
knn_optimal =LogisticRegression(C=optimal_C)
# fitting the model
knn_optimal.fit(X_train_stand,Y_train)
# predict the response
pred = knn_optimal.predict(X_test_stand)

# evaluate accuracy
acc = accuracy_score(Y_test, pred) * 100
print('\nThe accuracy of the classifier for k = %d is %f%%' % (optimal_C, acc))


The accuracy of the classifier for k = 1 is 86.090000%


In [18]:
confusion_matrix(Y_test,pred)

array([[ 2838,  1868],
       [ 2305, 22989]], dtype=int64)