In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from nltk.corpus import stopwords
import re
import nltk

In [2]:
# Download NLTK stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Extracting csv file from zip file

In [3]:
import zipfile
zip_file_path = r'c:\Users\admin\Downloads\training.1600000.processed.noemoticon.csv.zip'
csv_file_output_path = r'c:\Users\admin\Downloads\training_processed_noemoticon.csv'
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    csv_file_name = zip_ref.namelist()[0]  
    df = pd.read_csv(zip_ref.open(csv_file_name), encoding='latin-1', header=None)
    df.to_csv(csv_file_output_path, index=False)

In [4]:
df

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
...,...,...,...,...,...,...
1599995,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599996,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...


In [5]:
# Assign column names to the dataset
df.columns = ['target', 'ids', 'date', 'flag', 'user', 'text']

In [6]:
df

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
...,...,...,...,...,...,...
1599995,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599996,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...


 Display the first few rows of the dataset


In [7]:
print(df.head())

   target         ids                          date      flag  \
0       0  1467810369  Mon Apr 06 22:19:45 PDT 2009  NO_QUERY   
1       0  1467810672  Mon Apr 06 22:19:49 PDT 2009  NO_QUERY   
2       0  1467810917  Mon Apr 06 22:19:53 PDT 2009  NO_QUERY   
3       0  1467811184  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   
4       0  1467811193  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   

              user                                               text  
0  _TheSpecialOne_  @switchfoot http://twitpic.com/2y1zl - Awww, t...  
1    scotthamilton  is upset that he can't update his Facebook by ...  
2         mattycus  @Kenichan I dived many times for the ball. Man...  
3          ElleCTF    my whole body feels itchy and like its on fire   
4           Karoli  @nationwideclass no, it's not behaving at all....  


 Keeping only the relevant columns: 'target' and 'text'


In [8]:
df = df[['target', 'text']]

 Map target values to 0 = negative, 1 = neutral, 2 = positive


In [9]:
df['target'] = df['target'].map({0: 0, 2: 1, 4: 2})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['target'] = df['target'].map({0: 0, 2: 1, 4: 2})


 Preprocessing function to clean tweets


In [10]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

def preprocess_text(text):
    text = re.sub(r'http\S+', '', text) # Remove URLs
    text = re.sub(r'@\w+', '', text)    # Remove mentions
    text = re.sub(r'#\w+', '', text)    # Remove hashtags
    text = re.sub(r'\d+', '', text)     # Remove numbers
    text = re.sub(r'[^\w\s]', '', text) # Remove punctuation
    text = text.lower()                 # Convert to lowercase
    text = ' '.join([word for word in text.split() if word not in stopwords.words('english')])
    text = re.sub(r'[^\x00-\x7F]+', '', text)  # Remove non-ASCII characters

    lemmatizer = WordNetLemmatizer()
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])


    return text


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [11]:
df['text'] = df['text'].str.lower().str.replace(r'[^\w\s]', '', regex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'] = df['text'].str.lower().str.replace(r'[^\w\s]', '', regex=True)


In [12]:
df

Unnamed: 0,target,text
0,0,switchfoot httptwitpiccom2y1zl awww thats a b...
1,0,is upset that he cant update his facebook by t...
2,0,kenichan i dived many times for the ball manag...
3,0,my whole body feels itchy and like its on fire
4,0,nationwideclass no its not behaving at all im ...
...,...,...
1599995,2,just woke up having no school is the best feel...
1599996,2,thewdbcom very cool to hear old walt intervie...
1599997,2,are you ready for your mojo makeover ask me fo...
1599998,2,happy 38th birthday to my boo of alll time tup...


Getting neutral values

In [13]:
neutral_candidates = df[df['text'].str.contains(r'\b(no opinion|neutral|okay|fine)\b', case=False)]
print(neutral_candidates.head())

  neutral_candidates = df[df['text'].str.contains(r'\b(no opinion|neutral|okay|fine)\b', case=False)]


      target                                               text
7          0  loltrish hey  long time no see yes rains a bit...
424        0  treesahquiche okay about the applepears ive ta...
618        0  hannahsix cream for his eye and he may have he...
834        0                           okay so still no school 
1018       0  my dog cant move anymore praying that he will ...


In [14]:
df.loc[neutral_candidates.index, 'target'] = 1

In [15]:
print(df['target'].value_counts())  

target
0    795188
2    793014
1     11798
Name: count, dtype: int64


Spliting data

In [16]:
X = df['text']
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)


 Verifyng class distribution in the train and test sets

In [17]:
print("Class distribution in the training set:")
print(y_train.value_counts())
print("Class distribution in the test set:")
print(y_test.value_counts())

Class distribution in the training set:
target
0    636151
2    634411
1      9438
Name: count, dtype: int64
Class distribution in the test set:
target
0    159037
2    158603
1      2360
Name: count, dtype: int64


 Converting text data to numerical using TF-IDF Vectorizer (Unigrams and Bigrams)


In [18]:
# Convert text data to numerical using TF-IDF Vectorizer (Unigrams and Bigrams)
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
# Hyperparameter tuning with GridSearchCV
param_grid = {'C': [0.1, 1, 10], 'max_iter': [100, 200, 300]}
grid = GridSearchCV(LogisticRegression(class_weight='balanced'), param_grid, cv=5)
grid.fit(X_train_tfidf, y_train)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# Initialize the Random Forest Classifier
rf = RandomForestClassifier(class_weight='balanced', random_state=42)

# Define the parameter grid for RandomizedSearchCV
param_grid = {
    'n_estimators': [100, 200],          # Number of trees
    'max_depth': [10, 20, None],         # Maximum depth of trees
    'min_samples_split': [2, 5],         # Minimum samples required to split
    'min_samples_leaf': [1, 2]           # Minimum samples per leaf
}

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    rf, 
    param_distributions=param_grid, 
    n_iter=5, 
    cv=3, 
    scoring='accuracy', 
    random_state=42, 
    n_jobs=-1
)

# Fit the RandomizedSearchCV to the training data
random_search.fit(X_train_tfidf, y_train)

# Output the best parameters and score
print("Best Parameters:", random_search.best_params_)
print("Best Cross-Validated Score:", random_search.best_score_)


In [None]:
# Use the best model from GridSearchCV
best_model = grid.best_estimator_

In [None]:
# Make predictions on the test set
y_pred = best_model.predict(X_test_tfidf)

In [None]:
# Evaluate the model
print("Best Parameters:", grid.best_params_)
print("Best Cross-Validated Score:", grid.best_score_)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, labels=[0, 1, 2], 
      target_names=['Negative', 'Neutral', 'Positive'], zero_division=1))


Best Parameters: {'C': 1, 'max_iter': 100}
Best Cross-Validated Score: 0.7970218750000001
Accuracy: 0.795965625
              precision    recall  f1-score   support

    Negative       0.80      0.78      0.79    159037
     Neutral       1.00      1.00      1.00      2360
    Positive       0.79      0.81      0.80    158603

    accuracy                           0.80    320000
   macro avg       0.86      0.86      0.86    320000
weighted avg       0.80      0.80      0.80    320000



Saving the files into pkl file 

In [None]:
import pickle
# Save the vectorizer
with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

# Save the sentiment model
with open('sentiment_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)


In [None]:
import pickle

# Load the vectorizer
with open('tfidf_vectorizer.pkl', 'rb') as f:
    vectorizer = pickle.load(f)

# Load the sentiment model
with open('sentiment_model.pkl', 'rb') as f:
    sentiment_model = pickle.load(f)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Assuming X_train and X_test contain raw text data, you need to vectorize them
vectorizer = TfidfVectorizer(stop_words='english')

vectorizer = TfidfVectorizer(max_features=5000)  # Ensure consistent number of features

# Fit and transform the training data, then transform the test data
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Predictions on the training and test sets using the best_model
train_pred = best_model.predict(X_train_vectorized)
test_pred = best_model.predict(X_test_vectorized)

# Calculate accuracy on both sets
train_accuracy = accuracy_score(y_train, train_pred)
test_accuracy = accuracy_score(y_test, test_pred)

# Print performance metrics
print(f"Training Accuracy: {train_accuracy}")
print(f"Test Accuracy: {test_accuracy}")

# Evaluate using confusion matrix and classification report
print("Confusion Matrix (Test Set):")
print(confusion_matrix(y_test, test_pred))
print("Classification Report (Test Set):")
print(classification_report(y_test, test_pred))

# Check for overfitting or underfitting
if train_accuracy > test_accuracy:
    print("Possible overfitting: The model performs better on training data than on test data.")
elif train_accuracy < test_accuracy:
    print("Possible underfitting: The model performs poorly on both training and test data.")
else:
    print("Balanced model: The model performs well on both training and test data.")


Training Accuracy: 0.502046875
Test Accuracy: 0.50269375
Confusion Matrix (Test Set):
[[78413    42 80582]
 [ 1100     0  1260]
 [76098    56 82449]]
Classification Report (Test Set):
              precision    recall  f1-score   support

           0       0.50      0.49      0.50    159037
           1       0.00      0.00      0.00      2360
           2       0.50      0.52      0.51    158603

    accuracy                           0.50    320000
   macro avg       0.34      0.34      0.34    320000
weighted avg       0.50      0.50      0.50    320000

Possible underfitting: The model performs poorly on both training and test data.


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
import lightgbm as lgb

# Define models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Support Vector Machine": SVC(kernel='linear', random_state=42),
    "XGBoost": XGBClassifier(n_estimators=100, max_depth=5, random_state=42, use_label_encoder=False, eval_metric='logloss'),
    "LightGBM": lgb.LGBMClassifier(n_estimators=100, random_state=42)
}

# Vectorize the data using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')

X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Store results
model_performance = {}

# Evaluate each model
for model_name, model in models.items():
    print(f"\n--- Training {model_name} ---")
    # Train the model
    model.fit(X_train_vectorized, y_train)
    
    # Predict on train and test sets
    train_pred = model.predict(X_train_vectorized)
    test_pred = model.predict(X_test_vectorized)
    
    # Calculate accuracies
    train_accuracy = accuracy_score(y_train, train_pred)
    test_accuracy = accuracy_score(y_test, test_pred)
    
    # Store results
    model_performance[model_name] = {
        "Train Accuracy": train_accuracy,
        "Test Accuracy": test_accuracy,
        "Confusion Matrix": confusion_matrix(y_test, test_pred),
        "Classification Report": classification_report(y_test, test_pred, output_dict=True)
    }
    
    # Print results
    print(f"Training Accuracy: {train_accuracy}")
    print(f"Test Accuracy: {test_accuracy}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, test_pred))
    print("Classification Report:")
    print(classification_report(y_test, test_pred))

# Identify the best model based on test accuracy
best_model_name = max(model_performance, key=lambda x: model_performance[x]['Test Accuracy'])
best_model_results = model_performance[best_model_name]

print("\n--- Best Model ---")
print(f"Model: {best_model_name}")
print(f"Training Accuracy: {best_model_results['Train Accuracy']}")
print(f"Test Accuracy: {best_model_results['Test Accuracy']}")
print("Confusion Matrix:")
print(best_model_results['Confusion Matrix'])
print("Classification Report:")
print(best_model_results['Classification Report'])



--- Training Logistic Regression ---
Training Accuracy: 0.770784375
Test Accuracy: 0.766396875
Confusion Matrix:
[[117454      0  41583]
 [     8   2348      4]
 [ 33158      0 125445]]
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.74      0.76    159037
           1       1.00      0.99      1.00      2360
           2       0.75      0.79      0.77    158603

    accuracy                           0.77    320000
   macro avg       0.84      0.84      0.84    320000
weighted avg       0.77      0.77      0.77    320000


--- Training Random Forest ---
