In [1]:
import pandas as pd

# Specify encoding
train_df = pd.read_csv('train.csv', encoding='ISO-8859-1')
test_df = pd.read_csv('test.csv', encoding='ISO-8859-1')
manual_test_df = pd.read_csv('testdata.manual.2009.06.14.csv', encoding='ISO-8859-1')
large_train_df = pd.read_csv('training.1600000.processed.noemoticon.csv', encoding='ISO-8859-1')

# Explore the datasets
print("Train Data:")
print(train_df.head())
print(train_df.info())

print("\nTest Data:")
print(test_df.head())
print(test_df.info())

print("\nManual Test Data:")
print(manual_test_df.head())
print(manual_test_df.info())

print("\nLarge Train Data:")
print(large_train_df.head())
print(large_train_df.info())

# Check sentiment distribution in train and test datasets
print("\nTrain Data Sentiment Distribution:")
print(train_df['sentiment'].value_counts())

print("\nTest Data Sentiment Distribution:")
print(test_df['sentiment'].value_counts())

Train Data:
       textID                                               text  \
0  cb774db0d1                I`d have responded, if I were going   
1  549e992a42      Sooo SAD I will miss you here in San Diego!!!   
2  088c60f138                          my boss is bullying me...   
3  9642c003ef                     what interview! leave me alone   
4  358bd9e861   Sons of ****, why couldn`t they put them on t...   

                         selected_text sentiment Time of Tweet Age of User  \
0  I`d have responded, if I were going   neutral       morning        0-20   
1                             Sooo SAD  negative          noon       21-30   
2                          bullying me  negative         night       31-45   
3                       leave me alone  negative       morning       46-60   
4                        Sons of ****,  negative          noon       60-70   

       Country  Population -2020  Land Area (Km²)  Density (P/Km²)  
0  Afghanistan          38928346         

In [5]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download NLTK data
#nltk.download('stopwords')
#nltk.download('punkt')
#nltk.download('wordnet')

# Initialize lemmatizer and stop words
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Function for preprocessing text
def preprocess_text(text):
    text = text.lower()  # Lowercasing
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
    words = word_tokenize(text)  # Tokenization
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]  # Lemmatization and stop word removal
    return ' '.join(words)

# Load datasets
train_df = pd.read_csv('train.csv', encoding='ISO-8859-1')
test_df = pd.read_csv('test.csv', encoding='ISO-8859-1')

# Fill missing values in 'text' column with an empty string
train_df['text'].fillna('', inplace=True)
test_df['text'].fillna('', inplace=True)

# Apply preprocessing to train and test data
train_df['cleaned_text'] = train_df['text'].apply(preprocess_text)
test_df['cleaned_text'] = test_df['text'].apply(preprocess_text)

# Check the results
print("Preprocessed Train Data:")
print(train_df[['text', 'cleaned_text']].head())
print("\nPreprocessed Test Data:")
print(test_df[['text', 'cleaned_text']].head())

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\prabh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\prabh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\prabh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Preprocessed Train Data:
                                                text  \
0                I`d have responded, if I were going   
1      Sooo SAD I will miss you here in San Diego!!!   
2                          my boss is bullying me...   
3                     what interview! leave me alone   
4   Sons of ****, why couldn`t they put them on t...   

                             cleaned_text  
0                      id responded going  
1                 sooo sad miss san diego  
2                            bos bullying  
3                   interview leave alone  
4  son couldnt put release already bought  

Preprocessed Test Data:
                                                text  \
0  Last session of the day  http://twitpic.com/67ezh   
1   Shanghai is also really exciting (precisely -...   
2  Recession hit Veronique Branquinho, she has to...   
3                                        happy bday!   
4             http://twitpic.com/4w75p - I like it!!   

            

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Vectorize the text data using TF-IDF
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(train_df['cleaned_text'])
X_test_tfidf = tfidf.transform(test_df['cleaned_text'])

# Extract labels
y_train = train_df['sentiment']
y_test = test_df['sentiment']

# Split the train data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_tfidf, y_train, test_size=0.2, random_state=42)

# Print the shape of the datasets
print(f"Training Data Shape: {X_train.shape}")
print(f"Validation Data Shape: {X_val.shape}")
print(f"Test Data Shape: {X_test_tfidf.shape}")

Training Data Shape: (21984, 5000)
Validation Data Shape: (5497, 5000)
Test Data Shape: (4815, 5000)


In [8]:
# Train a Naive Bayes classifier
nb = MultinomialNB()
nb.fit(X_train, y_train)

# Predict and evaluate on validation data
y_val_pred_nb = nb.predict(X_val)
print("Naive Bayes Validation Accuracy:", accuracy_score(y_val, y_val_pred_nb))
print(classification_report(y_val, y_val_pred_nb))

# Assuming `test_df` is your test DataFrame with the necessary columns
# Define `y_test` from the `test_df` if not already done
y_test = test_df['sentiment']

# Ensure `y_test` is a pandas Series of strings (sentiment labels)
y_test = y_test.astype(str)

# Check unique values in y_test to ensure correctness
print("Unique values in y_test:", y_test.unique())

# Now you can proceed with evaluating the model predictions
y_test_pred_nb = nb.predict(X_test_tfidf)

# Convert predictions to strings if necessary
y_test_pred_nb = y_test_pred_nb.astype(str)

# Evaluate the Naive Bayes model on test data
print("Naive Bayes Test Accuracy:", accuracy_score(y_test, y_test_pred_nb))
print(classification_report(y_test, y_test_pred_nb))

Naive Bayes Validation Accuracy: 0.6359832635983264
              precision    recall  f1-score   support

    negative       0.74      0.49      0.59      1562
     neutral       0.56      0.76      0.65      2230
    positive       0.72      0.60      0.66      1705

    accuracy                           0.64      5497
   macro avg       0.67      0.62      0.63      5497
weighted avg       0.66      0.64      0.63      5497

Unique values in y_test: ['neutral' 'positive' 'negative' 'nan']
Naive Bayes Test Accuracy: 0.4664589823468328
              precision    recall  f1-score   support

         nan       0.00      0.00      0.00      1281
    negative       0.72      0.52      0.60      1001
     neutral       0.33      0.75      0.46      1430
    positive       0.74      0.60      0.66      1103

    accuracy                           0.47      4815
   macro avg       0.45      0.47      0.43      4815
weighted avg       0.42      0.47      0.41      4815



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [10]:
# Train an SVM classifier
svm = SVC()
svm.fit(X_train, y_train)

# Predict and evaluate on validation data
y_val_pred_svm = svm.predict(X_val)
print("SVM Validation Accuracy:", accuracy_score(y_val, y_val_pred_svm))
print(classification_report(y_val, y_val_pred_svm))

# Predict and evaluate on test data
y_test_pred_svm = svm.predict(X_test_tfidf)
print("SVM Test Accuracy:", accuracy_score(y_test, y_test_pred_svm))
print(classification_report(y_test, y_test_pred_svm))

SVM Validation Accuracy: 0.6951064216845552
              precision    recall  f1-score   support

    negative       0.78      0.56      0.65      1562
     neutral       0.62      0.80      0.70      2230
    positive       0.78      0.68      0.73      1705

    accuracy                           0.70      5497
   macro avg       0.73      0.68      0.69      5497
weighted avg       0.71      0.70      0.69      5497

SVM Test Accuracy: 0.5111111111111111
              precision    recall  f1-score   support

         nan       0.00      0.00      0.00      1281
    negative       0.75      0.57      0.65      1001
     neutral       0.36      0.79      0.50      1430
    positive       0.81      0.69      0.74      1103

    accuracy                           0.51      4815
   macro avg       0.48      0.51      0.47      4815
weighted avg       0.45      0.51      0.45      4815



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
# Train a Logistic Regression classifier
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

# Predict and evaluate on validation data
y_val_pred_lr = lr.predict(X_val)
print("Logistic Regression Validation Accuracy:", accuracy_score(y_val, y_val_pred_lr))
print(classification_report(y_val, y_val_pred_lr))

# Predict and evaluate on test data
y_test_pred_lr = lr.predict(X_test_tfidf)
print("Logistic Regression Test Accuracy:", accuracy_score(y_test, y_test_pred_lr))
print(classification_report(y_test, y_test_pred_lr))

Logistic Regression Validation Accuracy: 0.6825541204293251
              precision    recall  f1-score   support

    negative       0.73      0.59      0.65      1562
     neutral       0.62      0.75      0.68      2230
    positive       0.76      0.69      0.72      1705

    accuracy                           0.68      5497
   macro avg       0.70      0.67      0.68      5497
weighted avg       0.69      0.68      0.68      5497

Logistic Regression Test Accuracy: 0.5044652128764279
              precision    recall  f1-score   support

         nan       0.00      0.00      0.00      1281
    negative       0.70      0.61      0.66      1001
     neutral       0.35      0.74      0.48      1430
    positive       0.79      0.69      0.74      1103

    accuracy                           0.50      4815
   macro avg       0.46      0.51      0.47      4815
weighted avg       0.43      0.50      0.45      4815



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
from sklearn.model_selection import GridSearchCV

param_grid = {'alpha': [0.01, 0.1, 1]}
grid_search = GridSearchCV(MultinomialNB(), param_grid, cv=5)
grid_search.fit(X_train, y_train)
print("Best parameters for Naive Bayes:", grid_search.best_params_)

Best parameters for Naive Bayes: {'alpha': 1}


In [13]:
from sklearn.model_selection import cross_val_score

nb_best_model = MultinomialNB(alpha=grid_search.best_params_['alpha'])
scores = cross_val_score(nb_best_model, X_train, y_train, cv=5)
print("Cross-Validation Scores:", scores)
print("Mean CV Score:", scores.mean())

Cross-Validation Scores: [0.62701842 0.62656357 0.63133955 0.63179441 0.62875341]
Mean CV Score: 0.6290938709762198
