# Initial Exploration

In [None]:
import pandas as pd # importing pandas library

In [14]:
# Load the raw tweets CSV. header=None because the column names are assigned
# in a later cell; the file is decoded as ISO-8859-1.
raw_csv_path = './datasets/training.1600000.processed.noemoticon.csv'
df = pd.read_csv(raw_csv_path, header=None, encoding='ISO-8859-1')

In [15]:
# Preview the first five rows of the raw frame.
first_rows = df.head()
print(first_rows)

   0           1                             2         3                4  \
0  0  1467810369  Mon Apr 06 22:19:45 PDT 2009  NO_QUERY  _TheSpecialOne_   
1  0  1467810672  Mon Apr 06 22:19:49 PDT 2009  NO_QUERY    scotthamilton   
2  0  1467810917  Mon Apr 06 22:19:53 PDT 2009  NO_QUERY         mattycus   
3  0  1467811184  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY          ElleCTF   
4  0  1467811193  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY           Karoli   

                                                   5  
0  @switchfoot http://twitpic.com/2y1zl - Awww, t...  
1  is upset that he can't update his Facebook by ...  
2  @Kenichan I dived many times for the ball. Man...  
3    my whole body feels itchy and like its on fire   
4  @nationwideclass no, it's not behaving at all....  


In [18]:
df.columns = ['target', 'ids', 'date', 'flag', 'user', 'text'] # assigning column names
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   target  1600000 non-null  int64 
 1   ids     1600000 non-null  int64 
 2   date    1600000 non-null  object
 3   flag    1600000 non-null  object
 4   user    1600000 non-null  object
 5   text    1600000 non-null  object
dtypes: int64(2), object(4)
memory usage: 73.2+ MB
None


In [19]:
# Keep only the label and the tweet text; the remaining columns are not used
# by the rest of this notebook.
kept_columns = ['target', 'text']
df = df[kept_columns]
print(df.head())

   target                                               text
0       0  @switchfoot http://twitpic.com/2y1zl - Awww, t...
1       0  is upset that he can't update his Facebook by ...
2       0  @Kenichan I dived many times for the ball. Man...
3       0    my whole body feels itchy and like its on fire 
4       0  @nationwideclass no, it's not behaving at all....


In [21]:
# Number of rows for each distinct value of the `target` label.
target_counts = df['target'].value_counts()
target_counts

target
0    800000
4    800000
Name: count, dtype: int64

# Data Cleaning And Pre-Processing

In [25]:
import re
import nltk

# Fetch the NLTK resources needed by clean_text: the stopword list and the
# punkt tokenizer data.
for nltk_package in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_package)

from functools import lru_cache


@lru_cache(maxsize=1)
def _clean_text_resources():
    """Build the English stopword set and a Porter stemmer once and reuse them.

    Loading the stopword list and constructing a stemmer for every token is
    very costly when cleaning many tweets; caching them keeps the results
    identical while doing that work a single time. The set also makes the
    membership test constant-time instead of a list scan.
    """
    stop_words = frozenset(nltk.corpus.stopwords.words('english'))
    stemmer = nltk.stem.PorterStemmer()
    return stop_words, stemmer


def clean_text(text):
    """Normalize one tweet into a space-separated string of stemmed tokens.

    Steps: lowercase, strip URLs, @mentions, the '#' of hashtags, punctuation
    and digits, collapse whitespace, tokenize, drop English stopwords, and
    Porter-stem what remains.

    Parameters
    ----------
    text : str
        Raw tweet text. Any non-string value (e.g. None or NaN from a missing
        cell) is treated as an empty tweet.

    Returns
    -------
    str
        The cleaned text; may be empty if nothing survives the cleaning.
    """
    if not isinstance(text, str):
        # Missing values reach here as None/float NaN when used with
        # Series.apply; there is nothing to clean.
        return ''

    text = text.lower() # convert to lowercase
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE) # remove URLs
    text = re.sub(r'@\w+', '', text) # remove mentions
    text = re.sub(r'#(\w+)', r'\1', text)  # remove hashtags but keep the text
    text = re.sub(r'[^\w\s]', '', text) # remove punctuation
    text = re.sub(r'\d+', '', text) # remove numbers
    text = text.strip() # remove leading and trailing whitespace
    text = re.sub(r'\s+', ' ', text) # replace runs of whitespace with a single space

    stop_words, stemmer = _clean_text_resources()
    tokens = nltk.word_tokenize(text) # tokenize the text
    # Drop stopwords first, then stem the surviving tokens (same order as before).
    tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens) # join tokens back to string

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Shozab\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Shozab\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Shozab\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


In [26]:
# Run every raw tweet through clean_text and store the result in a new column.
cleaned_column = df['text'].apply(clean_text)
df['cleaned_text'] = cleaned_column

In [27]:
df = df.drop(columns='text') # dropping the original text column
# Save into ./datasets/ because that is where the next cell reads the cleaned
# file from; writing to the working directory left the two paths out of sync.
df.to_csv('./datasets/cleaned_tweets.csv', index=False) # saving the cleaned data to a new csv file

In [28]:
# Load the cleaned tweets back from disk and preview the first rows.
cleaned_csv_path = './datasets/cleaned_tweets.csv'
cleaned_df = pd.read_csv(cleaned_csv_path)
print(cleaned_df.head())

   target                                       cleaned_text
0       0       that bummer shoulda got david carr third day
1       0  upset cant updat facebook text might cri resul...
2       0       dive mani time ball manag save rest go bound
3       0                    whole bodi feel itchi like fire
4       0                              behav im mad cant see


# Model Training, Evaluation, and Saving

In [33]:
# Replace the numeric `target` labels with a readable `sentiment` column
# (0 -> 'negative', 4 -> 'positive') and drop the original column.
label_names = {0: 'negative', 4: 'positive'}
df = df.assign(sentiment=df['target'].map(label_names)).drop(columns='target')
print(df.head())

                                        cleaned_text sentiment
0       that bummer shoulda got david carr third day  negative
1  upset cant updat facebook text might cri resul...  negative
2       dive mani time ball manag save rest go bound  negative
3                    whole bodi feel itchi like fire  negative
4                              behav im mad cant see  negative


In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [34]:
# The instance below is bound to the name `TfidfVectorizer`, which later cells
# rely on (fit_transform, get_feature_names_out, joblib.dump). That rebinding
# hides the class, so calling `TfidfVectorizer(...)` again on a re-run would
# hit the instance and fail. Constructing from an aliased import keeps this
# cell re-runnable without changing the name the other cells use.
# TODO: rename the instance (e.g. `tfidf_vectorizer`) across all cells.
from sklearn.feature_extraction.text import TfidfVectorizer as _TfidfVectorizerClass

TfidfVectorizer = _TfidfVectorizerClass(max_features=3000) # TF-IDF vectorizer with the vocabulary capped at 3000 terms

In [35]:
# Model inputs: the cleaned tweet text (X) and its sentiment label (y).
X, y = df['cleaned_text'], df['sentiment']

In [36]:
# Learn the vocabulary from the cleaned tweets and encode them as TF-IDF
# features in a single step.
# NOTE(review): this fits on all of X before the train/test split performed
# later, so vocabulary/IDF statistics from the test rows are seen at fit time.
X_tfidf = TfidfVectorizer.fit_transform(raw_documents=X)

In [37]:
# Peek at the learned vocabulary: print the first 50 feature names.
feature_names = TfidfVectorizer.get_feature_names_out()
print(feature_names[:50])


['ab' 'abl' 'absolut' 'abt' 'ac' 'accent' 'accept' 'access' 'accid'
 'accident' 'accomplish' 'accord' 'account' 'ace' 'ach' 'across' 'act'
 'action' 'activ' 'actor' 'actual' 'ad' 'adam' 'add' 'addict' 'address'
 'admit' 'adopt' 'ador' 'adult' 'advanc' 'adventur' 'advertis' 'advic'
 'afford' 'afraid' 'afternoon' 'afterward' 'age' 'ago' 'agre' 'ah' 'aha'
 'ahah' 'ahaha' 'ahead' 'ahh' 'ahhh' 'ahhhh' 'ahhhhh']


In [38]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf,
    y,
    test_size=0.2,
    random_state=42,
)

In [39]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report


## Model A (Baseline): Train a LogisticRegression Model

In [41]:
print("Training the Logistic Regression model...")

# Build the classifier with a fixed seed and fit it on the training split;
# fit() returns the estimator itself, so the result can be bound directly.
log_reg_model = LogisticRegression(random_state=42).fit(X_train, y_train)

print("Logistic Regression model trained successfully!")


Training the Logistic Regression model...
Logistic Regression model trained successfully!


In [42]:
# Score the held-out rows, then compute both metrics before reporting them.
y_pred_log_reg = log_reg_model.predict(X_test)
accuracy_log_reg = accuracy_score(y_test, y_pred_log_reg)
log_reg_report = classification_report(y_test, y_pred_log_reg)

# Overall accuracy as a percentage, followed by per-class precision/recall/F1.
print(f"Logistic Regression Accuracy: {accuracy_log_reg * 100:.2f}%")
print("\nClassification Report for Logistic Regression:")
print(log_reg_report)


Logistic Regression Accuracy: 76.77%

Classification Report for Logistic Regression:
              precision    recall  f1-score   support

    negative       0.78      0.75      0.76    159494
    positive       0.76      0.79      0.77    160506

    accuracy                           0.77    320000
   macro avg       0.77      0.77      0.77    320000
weighted avg       0.77      0.77      0.77    320000



## Model B (Advanced): Train a RandomForestClassifier Model

In [44]:
# Forest settings: 100 trees, a fixed seed for reproducibility, and
# n_jobs=-1 to train on all available CPU cores.
rf_settings = dict(n_estimators=100, random_state=42, n_jobs=-1)

print("\nTraining the Random Forest model... (This may take several minutes)")

# Build the classifier and fit it on the training split in one expression;
# fit() returns the estimator itself.
rf_model = RandomForestClassifier(**rf_settings).fit(X_train, y_train)

print("Random Forest model trained successfully!")




Training the Random Forest model... (This may take several minutes)
Random Forest model trained successfully!


In [45]:
# Score the held-out rows, then compute both metrics before reporting them.
y_pred_rf = rf_model.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
rf_report = classification_report(y_test, y_pred_rf)

# Overall accuracy as a percentage, followed by per-class precision/recall/F1.
print(f"Random Forest Accuracy: {accuracy_rf * 100:.2f}%")
print("\nClassification Report for Random Forest:")
print(rf_report)


Random Forest Accuracy: 76.89%

Classification Report for Random Forest:
              precision    recall  f1-score   support

    negative       0.77      0.77      0.77    159494
    positive       0.77      0.77      0.77    160506

    accuracy                           0.77    320000
   macro avg       0.77      0.77      0.77    320000
weighted avg       0.77      0.77      0.77    320000



In [47]:
import os

import joblib

# --- Step 1: Explicitly choose the Logistic Regression model ---
# Based on our analysis, it provides the same performance with better speed.
best_model = log_reg_model
print(f"Selected model for saving: {type(best_model).__name__}")


# --- Step 2: Define the filepaths ---
model_filepath = './models/sentiment_model.pkl'
vectorizer_filepath = './models/vectorizer.pkl'

# joblib.dump does not create missing parent folders, so make sure the target
# directory exists; otherwise saving fails on a fresh checkout.
for artifact_path in (model_filepath, vectorizer_filepath):
    os.makedirs(os.path.dirname(artifact_path), exist_ok=True)


# --- Step 3: Save both the model and the vectorizer ---
# Save the trained model to a file
joblib.dump(best_model, model_filepath)
print(f"Model saved successfully to {model_filepath}")

# Save the TF-IDF vectorizer to a file
# (This is the same vectorizer you used for both models. Note that the name
# `TfidfVectorizer` refers to the fitted instance here, not the sklearn class,
# because it was rebound when the vectorizer was created.)
joblib.dump(TfidfVectorizer, vectorizer_filepath)
print(f"Vectorizer saved successfully to {vectorizer_filepath}")



Selected model for saving: LogisticRegression
Model saved successfully to ./models/sentiment_model.pkl
Vectorizer saved successfully to ./models/vectorizer.pkl
