In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import naive_bayes
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score, precision_score, classification_report

In [2]:
df = pd.read_csv('UMICH_SI650_Sentiment_Classification/training.txt', sep='\t', names=['liked', 'txt'])  # For CSV or comma-delimited text files

In [3]:
df.head(5)

Unnamed: 0,liked,txt
0,1,The Da Vinci Code book is just awesome.
1,1,this was the first clive cussler i've ever rea...
2,1,i liked the Da Vinci Code a lot.
3,1,i liked the Da Vinci Code a lot.
4,1,I liked the Da Vinci Code but it ultimatly did...


In [4]:
# tFidf Vectorizer
stopset = list(stopwords.words('english'))
vectorizer = TfidfVectorizer(use_idf=True, lowercase=True, strip_accents='ascii', stop_words=stopset)

In [5]:
# Dependent variable will be liked as 0(didn't like) and 1n (liked)
y_train =df.liked

In [6]:
# Convert df.txt from text to features
x_train = vectorizer.fit_transform(df['txt'])

In [7]:
# 6912 observations X 2022 unique words.
print(y_train.shape)
print(x_train.shape)

(6918,)
(6918, 2011)


In [8]:
x_train, x_train_test, y_train, y_train_test = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

In [9]:
# We'll train a naive bais classifier
cl = naive_bayes.MultinomialNB()
cl.fit(x_train, y_train)

In [10]:
# Make predictions
y_pred = cl.predict(x_train_test)

# Evaluate performance
accuracy = accuracy_score(y_train_test, y_pred)
precision = precision_score(y_train_test, y_pred)
report = classification_report(y_train_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print('Classification Report:')
print(report)

Accuracy: 0.9747109826589595
Precision: 0.9694749694749695
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.96      0.97       580
           1       0.97      0.99      0.98       804

    accuracy                           0.97      1384
   macro avg       0.98      0.97      0.97      1384
weighted avg       0.97      0.97      0.97      1384



In [15]:
roc_auc_score(y_train_test, cl.predict_proba(x_train_test)[:,1])

0.9983058843712472

In [11]:
import csv

with open('UMICH_SI650_Sentiment_Classification/testdata.txt', 'r', encoding='utf-8') as file:
    reader = csv.reader(file, delimiter='\n')
    lines = [row[0] for row in reader if row]

test = pd.DataFrame(lines, columns=['txt'])


In [12]:
test.head(3)

Unnamed: 0,txt
0,"I don't care what anyone says, I like Hillary..."
1,"harvard is dumb, i mean they really have to be..."
2,I'm loving Shanghai > > > ^ _ ^.


In [13]:
# Transform the test data using the same vectorizer
x_test = vectorizer.transform(test['txt'])

In [16]:
# Make predictions
y_test_pred = cl.predict(x_test)

In [17]:
# Create a DataFrame with the predictions
test['liked'] = y_test_pred

In [19]:
# Verify the DataFrame content
print(test.head())

                                                 txt  liked
0   I don't care what anyone says, I like Hillary...      1
1  harvard is dumb, i mean they really have to be...      0
2                   I'm loving Shanghai > > > ^ _ ^.      1
3                        harvard is for dumb people.      1
4  As i stepped out of my beautiful Toyota, i hea...      1


In [21]:
# Save the results to a new comma-separated file
test.to_csv('UMICH_SI650_Sentiment_Classification/testdata_predictions.csv', sep=',', index=False)
