# Import Libraries

In [1]:
import numpy as np
import pandas as pd

# Load the Dataset

In [10]:
encodings=["utf-8","latin1","ISO-8859-1","cp1252"]
file_path="test.csv"
for encoding in encodings:
    try:
        dataset=pd.read_csv(file_path,encoding=encoding)
        print(f"file read with encoding:{encoding}")
        break
    except UnicodeDecodeError:
        print(f"failed to read with encoding:{encoding}")
        continue
if "dataset" in locals():
    print("successfully loaded")
else:
    print("failed to load")

failed to read with encoding:utf-8
file read with encoding:latin1
successfully loaded


In [12]:
dataset.head()

Unnamed: 0,textID,text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral,morning,0-20,Afghanistan,38928346.0,652860.0,60.0
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive,noon,21-30,Albania,2877797.0,27400.0,105.0
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative,night,31-45,Algeria,43851044.0,2381740.0,18.0
3,01082688c6,happy bday!,positive,morning,46-60,Andorra,77265.0,470.0,164.0
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive,noon,60-70,Angola,32866272.0,1246700.0,26.0


In [27]:
dataset.isnull().sum()

textID               0
text                 0
sentiment            0
Time of Tweet        0
Age of User          0
Country              0
Population -2020     0
Land Area (Km²)      0
Density (P/Km²)      0
cleaned_text         0
preprocessed_text    0
dtype: int64

In [28]:
dataset.fillna("", inplace=True)

In [29]:
# Check distribution of sentiments
print("\nSentiment Distribution:")
print(dataset['sentiment'].value_counts())


Sentiment Distribution:
sentiment
neutral     1430
            1281
positive    1103
negative    1001
Name: count, dtype: int64


# Data Preprocessing

In [30]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [15]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [31]:
# Apply text cleaning and preprocessing
def clean_text(text):
    if isinstance(text, str):
        text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
        text = re.sub(r'[^a-zA-Z]', ' ', text)  # Remove non-alphabetic characters
        text = text.lower()  # Convert text to lowercase
        return text
    else:
        return ''

def preprocess_text(text):
    if isinstance(text, str):
        words = nltk.word_tokenize(text)  # Tokenization
        words = [word for word in words if word not in stop_words]  # Remove stopwords
        return ' '.join(words)
    else:
        return ''


In [32]:
dataset['cleaned_text'] = dataset['text'].apply(clean_text)
dataset['preprocessed_text'] = dataset['cleaned_text'].apply(preprocess_text)

# Feature Extraction

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Convert text data into numerical feature vectors
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Adjust max_features as needed
X = tfidf_vectorizer.fit_transform(dataset['preprocessed_text'])
y = dataset['sentiment']

# Model Building

In [47]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [35]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [44]:
from sklearn.ensemble import RandomForestClassifier
# Define the Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=42)

In [42]:
# Define the hyperparameters grid for grid search
param_grid_rf = {
    'n_estimators': [100, 200, 300],  # Number of trees in the forest
    'max_depth': [None, 10, 20],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split a node
}

In [45]:
# Define the hyperparameters grid for grid search
param_grid_rf = {
    'n_estimators': [100, 200, 300],  # Number of trees in the forest
    'max_depth': [None, 10, 20],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split a node
}

In [48]:
# Perform grid search to find the best hyperparameters
grid_search_rf = GridSearchCV(rf_classifier, param_grid_rf, cv=5, n_jobs=-1)
grid_search_rf.fit(X_train, y_train)

In [49]:
# Get the best parameters found by grid search
best_params_rf = grid_search_rf.best_params_
print("Best Hyperparameters (Random Forest):", best_params_rf)

Best Hyperparameters (Random Forest): {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}


In [50]:
# Train the Random Forest classifier with the best hyperparameters
rf_classifier = RandomForestClassifier(**best_params_rf, random_state=42)
rf_classifier.fit(X_train, y_train)

In [51]:
# Predict sentiments using the trained Random Forest classifier
y_pred_rf = rf_classifier.predict(X_test)

# Model Evaluation 

In [52]:
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print("\nRandom Forest Classifier Accuracy:", accuracy_rf)
print(classification_report(y_test, y_pred))


Random Forest Classifier Accuracy: 0.667705088265836
              precision    recall  f1-score   support

                   0.00      0.00      0.00       258
    negative       0.79      0.26      0.39       226
     neutral       0.29      0.75      0.42       268
    positive       0.74      0.61      0.67       211

    accuracy                           0.40       963
   macro avg       0.46      0.40      0.37       963
weighted avg       0.43      0.40      0.35       963



In [53]:
# Classification Report
print("\nRandom Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))


Random Forest Classification Report:
              precision    recall  f1-score   support

                   0.75      1.00      0.86       258
    negative       0.81      0.34      0.48       226
     neutral       0.51      0.64      0.57       268
    positive       0.73      0.64      0.68       211

    accuracy                           0.67       963
   macro avg       0.70      0.66      0.65       963
weighted avg       0.69      0.67      0.65       963



In [54]:
# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(conf_matrix)


Confusion Matrix:
[[  0   0 258   0]
 [  8  58 149  11]
 [ 22  13 200  33]
 [  6   2  75 128]]
