<a href="https://colab.research.google.com/github/samObot19/Assesment/blob/main/ml_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load the dataset

In [2]:
import pandas as pd

# Path of the training CSV (a Google Drive location under /content/drive).
file_path = '/content/drive/MyDrive/datasets/twitter_training.csv'

# The file is read without a header row, so the column names are given explicitly;
# surrounding whitespace is then trimmed from every tweet.
df = (
    pd.read_csv(file_path, names=["ID", "Topic", "Sentiment", "Tweet"], header=None)
    .assign(Tweet=lambda frame: frame["Tweet"].str.strip())
)

df.head()


Unnamed: 0,ID,Topic,Sentiment,Tweet
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


# Drop unnecessary columns (like ID and Topic)


In [3]:
# Keep only the columns used for modelling. errors="ignore" makes this cell safe to
# re-run: without it, a second execution raises KeyError because the columns are
# already gone from df.
df = df.drop(columns=["ID", "Topic"], errors="ignore")
print(df.columns)


Index(['Sentiment', 'Tweet'], dtype='object')


# Text preprocessing

In [4]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources this notebook relies on: the stop-word list, the
# tokenizer models (punkt / punkt_tab) and the WordNet data for lemmatisation.
for _resource in ('stopwords', 'punkt', 'wordnet', 'punkt_tab'):
    nltk.download(_resource)

# Shared helpers for the text-cleaning step.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Normalise a raw tweet into a space-separated string of lemmatised tokens.

    Steps, in order: strip URLs, strip @mentions and '#' signs (the hashtag
    word itself is kept), replace every non-letter with a space, lowercase,
    tokenise, drop English stop words and tokens of two characters or fewer,
    and lemmatise what is left.

    Args:
        text: The raw tweet. Non-string values are converted with ``str``;
            ``None`` and float NaN (a missing tweet) are treated as empty.

    Returns:
        The cleaned tokens joined by single spaces, or ``""`` when the input
        is missing or nothing survives the filtering.
    """
    # A missing value would otherwise be stringified into the bogus token "nan"/"none".
    if text is None or (isinstance(text, float) and text != text):  # NaN != NaN
        return ""
    text = str(text)
    # "http\S+" already covers https links.
    text = re.sub(r"http\S+|www\S+", '', text)
    # Remove whole @mentions; for hashtags only the '#' sign is dropped.
    text = re.sub(r"@\w+|#", '', text)
    # Digits, punctuation and any other non-letter become separators.
    text = re.sub(r"[^a-zA-Z]", " ", text)
    text = text.lower()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words and len(w) > 2]
    return " ".join(tokens)

# Clean every tweet element-wise and store the result in a new column.
df['Cleaned_Tweet'] = df['Tweet'].map(preprocess)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [8]:
# Preview the first 30 rows to compare raw and cleaned tweets side by side.
df.iloc[:30]

Unnamed: 0,Sentiment,Tweet,Cleaned_Tweet
0,Positive,im getting on borderlands and i will murder yo...,getting borderland murder
1,Positive,I am coming to the borders and I will kill you...,coming border kill
2,Positive,im getting on borderlands and i will kill you ...,getting borderland kill
3,Positive,im coming on borderlands and i will murder you...,coming borderland murder
4,Positive,im getting on borderlands 2 and i will murder ...,getting borderland murder
5,Positive,im getting into borderlands and i can murder y...,getting borderland murder
6,Positive,So I spent a few hours making something for fu...,spent hour making something fun know huge bord...
7,Positive,So I spent a couple of hours doing something f...,spent couple hour something fun know huge bord...
8,Positive,So I spent a few hours doing something for fun...,spent hour something fun know huge borderland ...
9,Positive,So I spent a few hours making something for fu...,spent hour making something fun know huge rhan...


# Encode Sentiment Labels

In [5]:
from sklearn.preprocessing import LabelEncoder

# LabelEncoder assigns integer codes following the sorted order of the class names,
# so the codes are not "Positive = 1, Negative = 0"; in the output below Positive
# is encoded as 3. Inspect le.classes_ for the full code -> name mapping.
le = LabelEncoder()
le.fit(df['Sentiment'])
df['Label'] = le.transform(df['Sentiment'])
df.head(20)

Unnamed: 0,Sentiment,Tweet,Cleaned_Tweet,Label
0,Positive,im getting on borderlands and i will murder yo...,getting borderland murder,3
1,Positive,I am coming to the borders and I will kill you...,coming border kill,3
2,Positive,im getting on borderlands and i will kill you ...,getting borderland kill,3
3,Positive,im coming on borderlands and i will murder you...,coming borderland murder,3
4,Positive,im getting on borderlands 2 and i will murder ...,getting borderland murder,3
5,Positive,im getting into borderlands and i can murder y...,getting borderland murder,3
6,Positive,So I spent a few hours making something for fu...,spent hour making something fun know huge bord...,3
7,Positive,So I spent a couple of hours doing something f...,spent couple hour something fun know huge bord...,3
8,Positive,So I spent a few hours doing something for fun...,spent hour something fun know huge borderland ...,3
9,Positive,So I spent a few hours making something for fu...,spent hour making something fun know huge rhan...,3


# TF-IDF Vectorization

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn the cleaned tweets into TF-IDF features, keeping at most 5000 vocabulary terms.
# NOTE(review): the vectorizer is fitted on the full corpus before the train/test
# split, so vocabulary/IDF statistics from the test rows leak into the features;
# consider fitting on the training rows only.
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df['Cleaned_Tweet'])
# Densify the sparse matrix. NOTE(review): this can need a lot of RAM on a large corpus.
X = X.toarray()
y = df['Label']


# Split Data

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified; consider stratify=y if classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)


# ML Model (Logistic Regression)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

# Standardise the features. The scaler is fitted on the training split only and
# then applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# The elasticnet penalty needs l1_ratio in [0, 1] and the saga solver.
# saga is stochastic, so a fixed random_state keeps the fit reproducible.
lr = LogisticRegression(
    max_iter=200,
    solver='saga',
    penalty='elasticnet',
    l1_ratio=0.5,
    class_weight='balanced',
    random_state=42,
)
lr.fit(X_train_scaled, y_train)

# Predict on the *scaled* test features: the model was trained on scaled inputs,
# so feeding it the raw X_test would produce meaningless predictions.
y_pred_lr = lr.predict(X_test_scaled)

print("Logistic Regression:")
print(classification_report(y_test, y_pred_lr, target_names=le.classes_))
