## Import

In [105]:
import numpy as np
import pandas as pd
import re

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [106]:
path_to_data = r"C:\Users\KIIT\OneDrive\Desktop\Twitter project\Twitter project\training.csv"


## Download Stopwords

In [107]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\KIIT\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [108]:
import os
print("Current working directory:", os.getcwd())
print("Path variable:", path_to_data)
print("Exists:", os.path.exists(path_to_data))


Current working directory: c:\Users\KIIT\OneDrive\Desktop\Twitter project\Twitter project
Path variable: C:\Users\KIIT\OneDrive\Desktop\Twitter project\Twitter project\training.csv
Exists: True


In [109]:
# Stopwords in english
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

## Data Processing

In [110]:
twitter_data = pd.read_csv(path_to_data, encoding='ISO-8859-1')

In [111]:
twitter_data.shape

(1599999, 6)

In [112]:
twitter_data.head(10)

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew
5,0,1467811592,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,mybirch,Need a hug
6,0,1467811594,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,coZZ,@LOLTrish hey long time no see! Yes.. Rains a...
7,0,1467811795,Mon Apr 06 22:20:05 PDT 2009,NO_QUERY,2Hood4Hollywood,@Tatiana_K nope they didn't have it
8,0,1467812025,Mon Apr 06 22:20:09 PDT 2009,NO_QUERY,mimismo,@twittera que me muera ?
9,0,1467812416,Mon Apr 06 22:20:16 PDT 2009,NO_QUERY,erinx3leannexo,spring break in plain city... it's snowing


We see, the column names are not defined. So we name the columns

In [113]:
column_names = ['target', 'id', 'date', 'flag', 'user', 'text']

In [114]:
twitter_data = pd.read_csv(path_to_data, names=column_names, encoding='ISO-8859-1')

In [115]:
twitter_data.shape

(1600000, 6)

In [116]:
twitter_data.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


### Clean the dataset, and perform other preprocessing steps

In [117]:
twitter_data.isnull().sum() #if yes, remove with dropna function

target    0
id        0
date      0
flag      0
user      0
text      0
dtype: int64

In [118]:
twitter_data.info() # to check for data types and ingormation about the features/columns

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   target  1600000 non-null  int64 
 1   id      1600000 non-null  int64 
 2   date    1600000 non-null  object
 3   flag    1600000 non-null  object
 4   user    1600000 non-null  object
 5   text    1600000 non-null  object
dtypes: int64(2), object(4)
memory usage: 73.2+ MB


In [119]:
# For the target data we check its distribution
twitter_data['target'].value_counts() # no of unique values

target
0    800000
4    800000
Name: count, dtype: int64

Convert the target 4 to '1'

* 0 -> Negative tweet
* 1 -> Positive tweet

In [120]:
twitter_data.replace({'target':{4:1}}, inplace=True)

In [121]:
twitter_data['target'].value_counts()

target
0    800000
1    800000
Name: count, dtype: int64

In [122]:
twitter_data.describe() # only on numerical features

Unnamed: 0,target,id
count,1600000.0,1600000.0
mean,0.5,1998818000.0
std,0.5,193576100.0
min,0.0,1467810000.0
25%,0.0,1956916000.0
50%,0.5,2002102000.0
75%,1.0,2177059000.0
max,1.0,2329206000.0


## Stemming

Stemming is the process of reducing a word to its Root word

In [123]:
port_stem = PorterStemmer()

In [124]:
def stemming(content):
  stemmed_content = re.sub('[^a-zA-Z]', ' ', content)
  stemmed_content = stemmed_content.lower()
  stemmed_content = stemmed_content.split()
  stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
  stemmed_content = ' '.join(stemmed_content)

  return stemmed_content

In [73]:
import pandas as pd
from nltk.stem.porter import PorterStemmer
import re

# Initialize stemmer
stemmer = PorterStemmer()

# Sample twitter_data
# twitter_data = pd.DataFrame({'text': ["I love Python!", None, "Bad tweet ðŸ˜¡", 123, "Happy coding!"]})

# Function to safely stem text
def stemming(text):
    # Ensure input is string
    if not isinstance(text, str):
        text = ""
    # Remove non-alphabetic characters
    text = re.sub('[^a-zA-Z]', ' ', text)
    # Lowercase and split
    words = text.lower().split()
    # Stem each word
    stemmed_words = [stemmer.stem(word) for word in words]
    return ' '.join(stemmed_words)

# Fill NaN with empty string (optional, just in case)
twitter_data['text'] = twitter_data['text'].fillna("")

# Apply the stemming function to the column
twitter_data['stemmed_content'] = twitter_data['text'].apply(stemming)

# Display first 5 rows
print(twitter_data[['text', 'stemmed_content']].head())


                                                text  \
0  @switchfoot http://twitpic.com/2y1zl - Awww, t...   
1  is upset that he can't update his Facebook by ...   
2  @Kenichan I dived many times for the ball. Man...   
3    my whole body feels itchy and like its on fire    
4  @nationwideclass no, it's not behaving at all....   

                                     stemmed_content  
0  switchfoot http twitpic com y zl awww that s a...  
1  is upset that he can t updat hi facebook by te...  
2  kenichan i dive mani time for the ball manag t...  
3       my whole bodi feel itchi and like it on fire  
4  nationwideclass no it s not behav at all i m m...  


### Save this new dataset to a file, so that you don't have to wait again for stemmed content

In [74]:
# twitter_data.to_csv("twitter_stemmed.csv")

In [75]:
import pandas as pd

file_path = "twitter_stemmed.csv"

for enc in ["utf-8", "latin1", "ISO-8859-1", "cp1252", "utf-8-sig"]:
    try:
        twitter_data = pd.read_csv(file_path, encoding=enc)
        print(f"Loaded successfully using: {enc}")
        break
    except Exception as e:
        print(f"Failed with {enc}: {e}")


Loaded successfully using: utf-8


In [76]:
twitter_data = pd.read_csv("twitter_stemmed.csv", encoding='ISO-8859-1')
twitter_data.head()

Unnamed: 0.1,Unnamed: 0,target,id,date,flag,user,text,stemmed_content
0,0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",switchfoot http twitpic com zl awww bummer sho...
1,1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,upset updat facebook text might cri result sch...
2,2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,kenichan dive mani time ball manag save rest g...
3,3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,whole bodi feel itchi like fire
4,4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",nationwideclass behav mad see


In [77]:
df = pd.read_csv("twitter_stemmed.csv", encoding="latin1", names=[
    "target","id","date","flag","user","text","stemmed_content"
])


  df = pd.read_csv("twitter_stemmed.csv", encoding="latin1", names=[


In [78]:
twitter_data.shape

(1600000, 8)

In [79]:
twitter_data = twitter_data.dropna()

In [80]:
twitter_data['stemmed_content'].isnull().sum()

0

In [81]:
print(twitter_data['stemmed_content'])

0          switchfoot http twitpic com zl awww bummer sho...
1          upset updat facebook text might cri result sch...
2          kenichan dive mani time ball manag save rest g...
3                            whole bodi feel itchi like fire
4                              nationwideclass behav mad see
                                 ...                        
1599995                           woke school best feel ever
1599996    thewdb com cool hear old walt interview http b...
1599997                         readi mojo makeov ask detail
1599998    happi th birthday boo alll time tupac amaru sh...
1599999    happi charitytuesday thenspcc sparkschar speak...
Name: stemmed_content, Length: 1599505, dtype: object


In [82]:
print(twitter_data['target'])

0          0
1          0
2          0
3          0
4          0
          ..
1599995    1
1599996    1
1599997    1
1599998    1
1599999    1
Name: target, Length: 1599505, dtype: int64


In [83]:
# Separating the data and label
X = twitter_data['stemmed_content'].values
Y = twitter_data['target'].values

In [84]:
print(X)

['switchfoot http twitpic com zl awww bummer shoulda got david carr third day'
 'upset updat facebook text might cri result school today also blah'
 'kenichan dive mani time ball manag save rest go bound' ...
 'readi mojo makeov ask detail'
 'happi th birthday boo alll time tupac amaru shakur'
 'happi charitytuesday thenspcc sparkschar speakinguph h']


In [85]:
print(Y)

[0 0 0 ... 1 1 1]


Splitting data into training and test data

In [86]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [87]:
print(X.shape, X_train.shape, X_test.shape)

(1599505,) (1279604,) (319901,)


In [88]:
print(X_train)

['retrorewind great show today' 'still wide awak good' 'raphi hai dear'
 ... 'buildabear yeah tri yet kinda came oven n thank love seri much'
 'feel well today im sick hate flu even breath well urgghh'
 'read http bit ly ckw cool know guy chat']


In [89]:
print(X_test)

['wow rain day sick' 'watch pineappl express' 'josh shep mate' ...
 'love day back work tomorrw thou' 'papercak might throw traffic'
 'adzyuk might nce cheap addit em look like major airtim']


## Converting the textual data into numerical data

In [90]:
vectorizer = TfidfVectorizer()

X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [91]:
print(X_train)

  (0, 338364)	0.7628381862700486
  (0, 152969)	0.3776275607531222
  (0, 365681)	0.41460113464293163
  (0, 410810)	0.3218403736026031
  (1, 384003)	0.3691429868522367
  (1, 441346)	0.6830798699695361
  (1, 28692)	0.5483731943180142
  (1, 150491)	0.3105192204255133
  (2, 332366)	0.7657519561732702
  (2, 157423)	0.5219552654647517
  (2, 95030)	0.37574811040167677
  (3, 80636)	0.3835292297173858
  (3, 135944)	0.4063031361036051
  (3, 445011)	0.2711595990651876
  (3, 120080)	0.293733848490576
  (3, 226721)	0.3566587252607362
  (3, 285969)	0.188883972260186
  (3, 67987)	0.24270801629409655
  (3, 295762)	0.29911337364690993
  (3, 301319)	0.17828517423734444
  (3, 13721)	0.2401642375246812
  (3, 170856)	0.16812262000788783
  (3, 143888)	0.3146866968657337
  (4, 178224)	0.43523912206826365
  (4, 320766)	0.24515767832411012
  :	:
  (1279601, 217405)	0.2861389702815746
  (1279601, 359279)	0.34642256117664655
  (1279601, 454001)	0.2515835426017201
  (1279601, 59311)	0.28600231966534617
  (1279601,

In [92]:
print(X_test)

  (0, 93609)	0.3525327427398741
  (0, 331177)	0.5285068720861447
  (0, 366583)	0.5290071602322319
  (0, 445699)	0.5626300524119296
  (1, 125925)	0.6386348593386298
  (1, 315957)	0.6943652281259849
  (1, 436124)	0.3316661671124705
  (2, 202339)	0.5218997677152007
  (2, 254333)	0.46360317780255983
  (2, 363448)	0.7160256461818862
  (3, 81931)	0.18427236505907085
  (3, 124327)	0.22368189134744526
  (3, 132059)	0.19573372235668549
  (3, 150112)	0.31563953896135527
  (3, 234133)	0.13852179207400306
  (3, 240023)	0.16385714815543026
  (3, 261781)	0.3112271791072081
  (3, 315052)	0.4746997405440727
  (3, 322467)	0.31169944506506464
  (3, 354584)	0.3325808954828312
  (3, 386695)	0.3824810184866783
  (3, 407500)	0.23616121648171529
  (4, 78604)	0.24916309279898022
  (4, 163551)	0.3195324890237502
  (4, 170856)	0.2320872835854368
  :	:
  (319896, 257907)	0.26365675745550726
  (319896, 302640)	0.2943213284757908
  (319896, 334331)	0.31753515341437183
  (319896, 372962)	0.5397364115985864
  (31989

## Training the Machine Learning model

### Logistic Regression

In [93]:
model = LogisticRegression(max_iter=1000)

In [94]:
model.fit(X_train, Y_train) # model is trained

## Model Evaluation

### Accuracy Score

In [95]:
# Accuracy score on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [96]:
print("Accuracy score on the training data: ", training_data_accuracy)

Accuracy score on the training data:  0.8024216867093257


In [97]:
# Accuracy score on training data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [98]:
print("Accuracy score on the test data: ", test_data_accuracy)

Accuracy score on the test data:  0.7783220433821714


## Saving the trained model

In [99]:
import pickle

In [100]:
filename = 'trained_model.sav'
pickle.dump(model, open(filename, 'wb'))

In [101]:
filename = 'vectorizer.pkl'
pickle.dump(vectorizer, open(filename, 'wb'))

## Using the saved model for future predictions

In [102]:
loaded_model = pickle.load(open('trained_model.sav', 'rb'))

In [103]:
X_new = X_test[200]
print(Y_test[200])

prediction = model.predict(X_new)
print(prediction)

if (prediction[0] == 0):
  print("Negative Tweet")
else:
  print("Positive Tweet")

1
[1]
Positive Tweet


In [104]:
sample_tweet = ["Just finished planting my first ever vegetable garden! So excited to watch it grow and enjoy homegrown veggies #sustainableliving #growyourown", "Stuck in the worst traffic jam ever. Been here for an hour and haven't moved an inch. Is it too early for a glass of wine? #commuterlife #terribletraffic"]
sample_tweet = vectorizer.transform(sample_tweet)
output_label = model.predict(sample_tweet)
print(output_label)

for i in range(len(output_label)):
    if (output_label[i] == 0):
        print("Negative Tweet")
    else:
        print("Positive Tweet")

[1 0]
Positive Tweet
Negative Tweet
