## Load the Dataset

In [38]:
import pandas as pd
pf=pd.read_csv("/content/sample_data/Tweets.csv")
pf.head(5)
pf.shape

(14640, 15)

In [39]:
df = pf[["airline_sentiment", "text"]]

In [40]:
df.head(10)

Unnamed: 0,airline_sentiment,text
0,neutral,@VirginAmerica What @dhepburn said.
1,positive,@VirginAmerica plus you've added commercials t...
2,neutral,@VirginAmerica I didn't today... Must mean I n...
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...
5,negative,@VirginAmerica seriously would pay $30 a fligh...
6,positive,"@VirginAmerica yes, nearly every time I fly VX..."
7,neutral,@VirginAmerica Really missed a prime opportuni...
8,positive,"@virginamerica Well, I didn't…but NOW I DO! :-D"
9,positive,"@VirginAmerica it was amazing, and arrived an ..."


# Preprocess Text

In [41]:
import nltk
import string
import re
from nltk.stem import PorterStemmer
import html
!pip install emoji
import emoji

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

from nltk.corpus import stopwords

ps = PorterStemmer()



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [42]:
def clean_text(text):
    text = html.unescape(text)
    text = text.encode('utf-8').decode('utf-8')
    text = emoji.demojize(text)
    text = text.lower()
    text = re.sub(r'[^\w\s@#]', '', text)
    tokens = nltk.word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [ps.stem(word) for word in tokens if word not in stop_words]
    return " ".join(tokens)

df["text_cleaned"] = df["text"].apply(clean_text)
print(df.head(100))

   airline_sentiment                                               text  \
0            neutral                @VirginAmerica What @dhepburn said.   
1           positive  @VirginAmerica plus you've added commercials t...   
2            neutral  @VirginAmerica I didn't today... Must mean I n...   
3           negative  @VirginAmerica it's really aggressive to blast...   
4           negative  @VirginAmerica and it's a really big bad thing...   
..               ...                                                ...   
95          negative  @VirginAmerica Is it me, or is your website do...   
96          negative  @VirginAmerica I can't check in or add a bag. ...   
97          negative  @VirginAmerica - Let 2 scanned in passengers l...   
98          negative  @virginamerica What is your phone number. I ca...   
99          negative  @VirginAmerica is anyone doing anything there ...   

                                         text_cleaned  
0                     @ virginamerica @ dhe

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["text_cleaned"] = df["text"].apply(clean_text)


# Feature Extraction

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=3000)
X = vectorizer.fit_transform(df['text_cleaned']).toarray()
Y = df['airline_sentiment'].values
print(X.shape)



(14640, 3000)


# Train_Model

In [44]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=2)

nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
y_pred_nb = nb_model.predict(X_test)
accuracy_nb = accuracy_score(y_test, y_pred_nb)
print("Multinomial Naive Bayes Accuracy:", accuracy_nb)

rf_model = RandomForestClassifier(random_state=2, n_estimators=100)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print("Random Forest Accuracy:", accuracy_rf)

Multinomial Naive Bayes Accuracy: 0.7315573770491803
Random Forest Accuracy: 0.7482923497267759


In [48]:
df.groupby('airline_sentiment').describe()


Unnamed: 0_level_0,text,text,text,text,text_cleaned,text_cleaned,text_cleaned,text_cleaned
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq
airline_sentiment,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
negative,9178,9087,@AmericanAir robocalls me with another Cancell...,2,9178,9077,@ unit suck,3
neutral,3099,3067,@SouthwestAir sent,5,3099,3029,@ unit thank,9
positive,2363,2298,@united thanks,5,2363,2210,@ unit thank,28


# Save the model

In [45]:
import joblib

joblib.dump(nb_model, 'nb_model.pkl')
joblib.dump(rf_model, 'rf_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

print("✅ Models and vectorizer saved successfully!")
import os
print(os.listdir())


✅ Models and vectorizer saved successfully!
['.config', 'rf_model.pkl', 'tfidf_vectorizer.pkl', 'nb_model.pkl', 'sample_data']
