In [13]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [15]:
# Load the labelled training comments and the comments that still need a label.
# Both CSV files are read relative to the notebook's working directory.
df = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

# Raw comment text from test.csv; it is vectorized and classified further down.
comments_test = df_test["CONTENT"]

# Preview a few rows with rich display instead of printing the whole Series.
comments_test.head()

FileNotFoundError: [Errno 2] No such file or directory: 'train.csv'

In [None]:
# Report the size, then show the first rows with rich display rather than a
# plain-text print of the frame.
print(df.shape)
df.head()

In [None]:
# Report the size once and show the first rows; printing the frame and then
# calling head() displayed the same data twice.
print(df_test.shape)
df_test.head()

In [None]:
# Count the missing values in each column of the training frame.
df.isna().sum()

We can see that the CONTENT and CLASS columns have 0 missing values. Only some dates are missing, which does not matter for this task. Let's remove the irrelevant columns.

In [None]:
# DATE and VIDEO_NAME are not used as features, so drop them.
# errors="ignore" keeps this cell idempotent: re-running it after the columns
# are already gone no longer raises a KeyError.
df = df.drop(columns=["DATE", "VIDEO_NAME"], errors="ignore")

In [None]:
# First ten rows of the training frame after the column drop.
df.head(n=10)

Let's create our training and testing data: 20% of the data for testing and 80% for training.

In [None]:
# X holds the raw comment text, Y the class label of each comment.
X = df["CONTENT"]
Y = df["CLASS"]

# Preview a few comments instead of printing the whole Series.
X.head()

In [None]:
# Hold out 20% of the labelled comments for evaluation; the fixed random_state
# makes the split reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)


In [None]:
# Learn the TF-IDF vocabulary from the training comments only, then reuse that
# same fitted vectorizer to encode the held-out comments.
feature_extraction = TfidfVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
)
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# Force integer labels (0, 1) in case they were read as strings such as "0", "1".
Y_train, Y_test = Y_train.astype('int'), Y_test.astype('int')

In [None]:
# Encode the comments from test.csv with the vectorizer fitted on the training
# split (transform only, so the vocabulary is not re-learned).
test_data_X = feature_extraction.transform(comments_test)

In [None]:
# (number of training comments, number of TF-IDF features)
X_train_features.shape

In [None]:
# Print the TF-IDF matrix in text form, then display its shape
# (rows = training comments, columns = vocabulary terms).
print(X_train_features)
X_train_features.shape

We created the TF-IDF matrix, where each comment is represented as a vector that gives more weight to certain words. 
Notice how there are 2821 columns, meaning there are 2821 features (that's a lot!). 

In [None]:
# Untrained logistic regression classifier with scikit-learn's default settings.
model = LogisticRegression()

In [None]:
# Train on the TF-IDF training matrix. `model` was created with default
# arguments, and scikit-learn's default applies an L2 penalty (C=1.0), so this
# is regularized maximum likelihood rather than plain MLE.
model.fit(X_train_features, Y_train)

In [None]:
# Predict a 0/1 label for every training comment; each row of the feature
# matrix is the vector representation of one comment.
prediction_on_training_data = model.predict(X_train_features)

# Accuracy = correct predictions / total predictions, i.e. 1 - training error rate.
accuracy_on_training_data = accuracy_score(
    y_true=Y_train,
    y_pred=prediction_on_training_data,
)
print(accuracy_on_training_data)

In [None]:
# Classify the comments from test.csv with the fitted model.
test_data_prediction = model.predict(test_data_X)
print(test_data_prediction)

# A bare `.shape` expression in the middle of a cell is never displayed, so
# print it explicitly.
print(test_data_prediction.shape)

# Attach the predicted label to each row and preview the result.
df_test["CLASS"] = test_data_prediction
df_test.head()



In [None]:
# Drop the columns that are not wanted in the exported result.
# errors="ignore" keeps this cell idempotent: re-running it after the columns
# are already gone no longer raises a KeyError.
df_test = df_test.drop(
    columns=["AUTHOR", "DATE", "CONTENT", "VIDEO_NAME"],
    errors="ignore",
)
df_test.head()

In [None]:
# Write the remaining df_test columns (including the predicted CLASS) to a CSV
# file in the working directory; index=False leaves out the pandas row index.
df_test.to_csv("classified_result.csv", index=False)

In [None]:
# Display the array of labels predicted for the training comments.
prediction_on_training_data

In [None]:
# Accuracy of the baseline model on the data it was trained on.
print(accuracy_on_training_data)

On the training data, our predictions were correct 97.5% of the time using the logistic regression model we fitted. 

In [None]:
# Score the baseline model on the 20% hold-out split it never saw during training.
prediction_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(
    y_true=Y_test,
    y_pred=prediction_on_test_data,
)

In [None]:
# Accuracy of the baseline model on the hold-out split.
print(accuracy_on_test_data)

The accuracy on the held-out test split (the 20% of train.csv set aside earlier) is pretty good as well. 

In [None]:
# Try the model on a hand-written comment: put the text between the quotes.
# The list wrapper is needed because the vectorizer expects an iterable of documents.
input_your_comment = [""]
input_data_features = feature_extraction.transform(input_your_comment)
prediction = model.predict(input_data_features)
print(prediction)


In [None]:
# Read test.csv again into a fresh frame; df_test had most of its columns
# dropped before it was exported.
df2 = pd.read_csv("test.csv")

In [None]:
# First rows of the freshly loaded test.csv.
df2.head()

In [None]:
X1 = df2["CONTENT"]
# test.csv carries no true labels, so these predictions cannot be scored here.
# Generalization is estimated instead from the labelled hold-out split
# (X_test_features / Y_test) that is evaluated elsewhere in this notebook.
transform_X1 = feature_extraction.transform(X1)
new_prediction = model.predict(transform_X1)

In [None]:
# Now that we have a working baseline model, use grid-search cross-validation
# to look for better hyperparameters.
#
# Construct the parameter grid as a list of sub-grids so that only solver /
# penalty pairs that LogisticRegression actually supports are tried:
#   * lbfgs, newton-cg and sag support only the L2 penalty,
#   * liblinear supports L1 and L2,
#   * saga supports L1, L2 and elastic net (which also requires l1_ratio).
# A single cross-product of all penalties and solvers makes most fits fail.
# The unpenalized option is left out: it ignores C (so crossing it with C only
# repeats identical fits), its spelling differs between scikit-learn versions,
# and the largest C values below are already close to no regularization.
c_values = np.logspace(-4, 4, 20)  # inverse regularization strength, 1e-4 .. 1e4
max_iters = [100, 1000, 2500, 5000]

param_grid = [
    {
        'penalty': ['l2'],
        'C': c_values,
        'solver': ['lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'],
        'max_iter': max_iters,
    },
    {
        'penalty': ['l1'],
        'C': c_values,
        'solver': ['liblinear', 'saga'],
        'max_iter': max_iters,
    },
    {
        'penalty': ['elasticnet'],
        'l1_ratio': [0.25, 0.5, 0.75],
        'C': c_values,
        'solver': ['saga'],
        'max_iter': max_iters,
    },
]


In [None]:

# Exhaustive grid search over `param_grid` with 3-fold cross-validation on the
# training features. GridSearchCV is imported in the imports cell at the top of
# the notebook; n_jobs=-1 runs the fits on all available CPU cores.
clf = GridSearchCV(model, param_grid=param_grid, cv=3, verbose=True, n_jobs=-1)

# fit() returns the search object itself, so `best_clf` and `clf` refer to the
# same fitted object; the refitted winner is exposed as `best_estimator_`.
best_clf = clf.fit(X_train_features, Y_train)
best_clf.best_estimator_

In [None]:
# Accuracy of the best model found by the search, measured on the *training*
# data (the same rows the search was fitted on), so this is an optimistic
# figure and not a hold-out estimate.
print(f'Accuracy after tuninig parameters - : {best_clf.score(X_train_features, Y_train):.3f}')
# Compare with the baseline model's training accuracy computed earlier.

In [None]:
# Training-set predictions from the fitted grid-search object.
clf_prediction_on_training_data = clf.predict(X_train_features)

# Share of training labels predicted correctly, i.e. 1 - training error rate.
clf_accuracy_on_training_data = accuracy_score(
    y_true=Y_train,
    y_pred=clf_prediction_on_training_data,
)
print(clf_accuracy_on_training_data)

In [None]:
# Evaluate the tuned model on the hold-out split that was not used for fitting.
clf_prediction_on_test_data = clf.predict(X_test_features)
clf_accuracy_on_test_data = accuracy_score(
    y_true=Y_test,
    y_pred=clf_prediction_on_test_data,
)
print(clf_accuracy_on_test_data)