# Initilization

In [2]:
"""
DATA COLLECTION
"""

import requests, zipfile, io
url = 'https://github.com/rgap/NMA-Twitter-SentimentAnalysis/raw/main/data/raw/trainingandtestdata.zip'
raw_data_directory = 'data/raw/'
r = requests.get(url)
z = zipfile.ZipFile(io.BytesIO(r.content))
z.extractall(path=raw_data_directory)


# Read the dataset

import pandas as pd
raw_data_directory = 'data/raw/'
# We load the dataset (THIS WILL USE ONLY THE TRAINING DATASET)
header_list = ["polarity", "id", "date", "query", "user", "text"]
df = pd.read_csv(raw_data_directory + 'training.1600000.processed.noemoticon.csv',
                 encoding = "ISO-8859-1", names=header_list)
# Let's have a look at it
df.head(2)

"""
DATA PREPROCESSING
"""

# Features
features = ['id', 'date', 'query', 'user', 'text']
# Target
target = 'polarity'

# Transform the polarity column into just 0s and 1s because it has only 2 unique values and the column type should be int
df['polarity'] = df['polarity'].apply(lambda x: 0 if x == 0 else 1)

# 1. Split data into train and test set

We analyze only text data, so X will be df['text']

In [3]:
from sklearn.model_selection import train_test_split

X = df.text.values
y = df.polarity.values

# Split the data into train and test
x_train_text, x_test_text, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,stratify=y)

In [23]:
# check the size of the train and test data
print("Train data size: ", len(x_train_text))
print("Test data size: ", len(x_test_text))

Train data size:  1280000
Test data size:  320000


# 2. Feature Engineering

In [25]:
# We use CountVectorizer to convert the text into a matrix of token counts
# For example if we have the following tweets:
# "I am learning NLP"
# "NLP is fun"
# The CountVectorizer will convert it into:

# [[1, 1, 1, 1, 0],
#  [0, 1, 0, 1, 1]]

# The columns are the unique words in the text
# The values are the count of the word in the text
# But it will be in a sparse matrix format. Then it will look like

# This will be the first tweet
#   (0, 0)	1
#   (0, 1)	1
#   (0, 2)	1
#   (0, 3)	1

# This will be the second tweet
#   (1, 1)	1
#   (1, 3)	1
#   (1, 4)	1

# The first column is the row index, the second column is the column index, and the third column is the value
# We can convert it to a dense matrix using toarray() method

from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
vectorizer.fit(x_train_text)
x_train_count_vectorizer = vectorizer.transform(x_train_text)

In [26]:
# print the shape of the matrix
print(x_train_count_vectorizer.shape)
# the shape is (1280000, 589260) which means we have 1280000 tweets and 589260 unique words

(1280000, 589260)


In [28]:
# print the first row before converting it to a dense matrix
print(x_train_text[0])

@paisleypaisley LOL why do i get ideas so far in advance? it's not even june yet! we need a third knitter to have our own summer group 


In [27]:
# print the first row of the matrix  
print(x_train_count_vectorizer[0])

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 25 stored elements and shape (1, 589260)>
  Coords	Values
  (0, 44017)	1
  (0, 165048)	1
  (0, 187921)	1
  (0, 193153)	1
  (0, 213818)	1
  (0, 223808)	1
  (0, 231133)	1
  (0, 247732)	1
  (0, 251948)	1
  (0, 257727)	1
  (0, 283262)	1
  (0, 300176)	1
  (0, 323145)	1
  (0, 379546)	1
  (0, 389097)	1
  (0, 400698)	1
  (0, 401872)	1
  (0, 403800)	1
  (0, 484798)	1
  (0, 501642)	1
  (0, 523180)	1
  (0, 528584)	1
  (0, 559664)	1
  (0, 564428)	1
  (0, 580381)	1
[[0 0 0 ... 0 0 0]]


# 3. Model Selection

Our goal is to train a model capable of estimating the sentiment (POLARITY) of a tweet: 0 or 1

So we need a binary classifier.

### Testing a Logistic Regression Model

In [29]:
# Testing a logistic regression model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline

# Create a pipeline
model = make_pipeline(CountVectorizer(), LogisticRegression())
# Fit the model
model.fit(x_train_text, y_train)
# Predict the test data
y_pred = model.predict(x_test_text)
# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy:  0.797578125


In [31]:
# Checking the classification_report but with precision of 4
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, digits=4))

# The accuracy is 0.79 which is not bad for a simple model
# But we can improve it by using a more complex model like a
# neural network or by tuning the hyperparameters of the model

              precision    recall  f1-score   support

           0     0.8086    0.7798    0.7939    160000
           1     0.7873    0.8154    0.8011    160000

    accuracy                         0.7976    320000
   macro avg     0.7980    0.7976    0.7975    320000
weighted avg     0.7980    0.7976    0.7975    320000



### Explaining the model results

In [39]:
feature_names = model.named_steps['countvectorizer'].get_feature_names_out()
# Get the coefficients of the model
coefs = model.named_steps['logisticregression'].coef_[0]
# Create a dataframe of the coefficients
df_coefs = pd.DataFrame({'word': feature_names, 'coef': coefs})
# Sort the dataframe by the coefficients
df_coefs = df_coefs.sort_values('coef')
# Print the top 10 words that have the highest coefficients
print(df_coefs.tail(10))
# Print the top 10 words that have the lowest coefficients
print(df_coefs.head(10))



                   word      coef
483052           smiles  2.429968
371747      musicmonday  2.487605
579089             yayy  2.520072
580439              yey  2.590780
569457          worries  2.657036
91247           blessed  2.727705
417326         pleasure  2.789915
579106            yayyy  2.799602
134081  congratulations  3.041287
483142          smiling  3.289673
                 word      coef
161467  disappointing -4.279824
193912        fathers -3.994550
104468         bummed -3.831200
452580         ruined -3.804651
357600         missin -3.749923
455520        sadness -3.700784
156551     depressing -3.694853
225549         gutted -3.651268
207419        funeral -3.631763
156541      depressed -3.610566


In [None]:
# this means that the word "smiles" has the highest coefficient and the word "depressed" has the lowest coefficient
# which means that the word "smiles" is more likely to be in a positive tweet and the word "depressed" is more likely to be in a negative tweet
# We can use these coefficients to understand the model better
# The word "smiles" has the highest coefficient because it is more likely to be in a positive tweet
# The word "depressed" has the lowest coefficient because it is more likely to be in a negative tweet