In [1]:
# importing required libraries
import pandas as pd
from joblib import dump
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline


In [2]:
# read the first dataset (data.csv, resolved relative to the working directory)
# NOTE(review): `data` is only previewed in the next cell; the modelling cells
# visible in this notebook all work on `data2` — confirm this load is still needed
data = pd.read_csv('data.csv')

In [3]:
# preview the first five rows of the first dataset
data.head(n=5)

Unnamed: 0,sentiment,text,user
0,neutral,RT @3novices: Angela Merkel calls Russian inva...,LuisaRagni
1,negative,RT @DominoDataLab: Learn from the creators of ...,XeronBot
2,negative,RT @DominoDataLab: Learn from the creators of ...,grantho
3,neutral,RT @ragipsoylu: Current control of territory i...,Data_Science_11
4,neutral,"Technically, if 442.2% of the population is in...",sporksys


In [4]:
# read the tweet dataset that the train/test split and models below are built on
data2 = pd.read_csv("archive/tweet_dataset.csv")

In [5]:
# summarise column names, non-null counts and dtypes
# NOTE: in the recorded output `text` has missing values (39934 of 40000 non-null)
# while `old_text` is fully populated
data2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   textID         40000 non-null  int64 
 1   sentiment      40000 non-null  object
 2   author         40000 non-null  object
 3   text           39934 non-null  object
 4   old_text       40000 non-null  object
 5   aux_id         40000 non-null  object
 6   new_sentiment  31395 non-null  object
 7   selected_text  27767 non-null  object
dtypes: int64(1), object(7)
memory usage: 2.4+ MB


In [6]:
# preview the first five rows of the tweet dataset
data2.head(n=5)

Unnamed: 0,textID,sentiment,author,text,old_text,aux_id,new_sentiment,selected_text
0,1956967341,empty,xoshayzers,i know i was listenin to bad habit earlier a...,@tiffanylue i know i was listenin to bad habi...,p1000000000,,
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...,Layin n bed with a headache ughhhh...waitin o...,c811396dc2,negative,headache
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...,Funeral ceremony...gloomy friday...,9063631ab1,negative,gloomy
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends SOON!,wants to hang out with friends SOON!,2a815f151d,positive,wants to hang out with friends SOON!
4,1956968416,neutral,xkilljoyx,We want to trade with someone who has Houston...,@dannycastillo We want to trade with someone w...,82565a56d3,neutral,We want to trade with someone who has Houston ...


In [7]:
# hold out 20% of the rows for testing; stratify on the sentiment label so both
# splits keep the same class proportions, and fix the seed for repeatability
train, test = train_test_split(
    data2,
    test_size=0.2,
    stratify=data2['sentiment'],
    random_state=21,
)

In [8]:
# (rows, columns) of the train split and of the test split
(train.shape, test.shape)

((32000, 8), (8000, 8))

In [9]:
# create a TF-IDF vectorizer object: lower-cased tokens, vocabulary capped at the
# 1000 most frequent terms, English stop words removed.
# stop_words uses the built-in 'english' option, which selects the same
# ENGLISH_STOP_WORDS set as before; scikit-learn documents only 'english', a list
# or None for this parameter, and releases that validate parameters reject a frozenset.
tfidf_vectorizer = TfidfVectorizer(lowercase=True, max_features=1000, stop_words='english')

In [10]:
# learn the vocabulary and IDF weights from the training tweets only
tfidf_vectorizer.fit(train['old_text'])

TfidfVectorizer(max_features=1000,
                stop_words=frozenset({'a', 'about', 'above', 'across', 'after',
                                      'afterwards', 'again', 'against', 'all',
                                      'almost', 'alone', 'along', 'already',
                                      'also', 'although', 'always', 'am',
                                      'among', 'amongst', 'amoungst', 'amount',
                                      'an', 'and', 'another', 'any', 'anyhow',
                                      'anyone', 'anything', 'anyway',
                                      'anywhere', ...}))

In [11]:
# turn both splits into TF-IDF matrices using the vocabulary fitted on train
train_idf = tfidf_vectorizer.transform(train['old_text'])
test_idf = tfidf_vectorizer.transform(test['old_text'])

In [12]:
# create the LogisticRegression model object (default hyper-parameters)
# NOTE(review): the next cell rebinds model_LR with max_iter=1000 before any fit,
# so this instance is never trained
model_LR = LogisticRegression()

In [13]:
# logistic-regression classifier with a raised iteration cap (max_iter=1000)
model_LR = LogisticRegression(
    solver='lbfgs',
    max_iter=1000,
)

In [14]:
# train the classifier on the TF-IDF features and the sentiment labels
model_LR.fit(X=train_idf, y=train['sentiment'])

LogisticRegression(max_iter=1000)

In [15]:
# predict the labels on the training data
predict_train = model_LR.predict(train_idf)

In [16]:
# predict the labels on the test data
predict_test = model_LR.predict(test_idf)

In [17]:
# micro-averaged F1 score on the training split
# NOTE(review): with one label per tweet, micro-averaged F1 should coincide with
# plain accuracy — confirm this is the metric intended
f1_score(
    y_true=train['sentiment'],
    y_pred=predict_train,
    average='micro',
)

0.3971875

In [18]:
# micro-averaged F1 score on the held-out test split
f1_score(
    y_true=test['sentiment'],
    y_pred=predict_test,
    average='micro',
)

0.35325

In [19]:
# define the stages of the pipeline: TF-IDF vectorisation followed by a
# logistic-regression classifier, so raw tweet text can be passed in directly.
# stop_words uses the built-in 'english' option, which selects the same
# ENGLISH_STOP_WORDS set as before; scikit-learn documents only 'english', a list
# or None for this parameter, and releases that validate parameters reject a frozenset.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(lowercase=True,
                              max_features=1000,
                              stop_words='english')),
    ('model', LogisticRegression(solver='lbfgs', max_iter=1000)),
])



In [20]:
# train both pipeline stages end-to-end on the raw training tweets and their labels
pipeline.fit(train['old_text'], train['sentiment'])

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(max_features=1000,
                                 stop_words=frozenset({'a', 'about', 'above',
                                                       'across', 'after',
                                                       'afterwards', 'again',
                                                       'against', 'all',
                                                       'almost', 'alone',
                                                       'along', 'already',
                                                       'also', 'although',
                                                       'always', 'am', 'among',
                                                       'amongst', 'amoungst',
                                                       'amount', 'an', 'and',
                                                       'another', 'any',
                                                       'anyhow', 'anyone',
           

In [29]:
# one example tweet, wrapped in a list so it is passed as a collection of documents
text = [
    "i love her on the outside but hate her from the inside",
]

# let the fitted pipeline vectorise the tweet and predict its label
pipeline.predict(text)

array(['love'], dtype=object)

In [22]:
# persist the fitted pipeline (vectorizer + classifier) to disk as a joblib file;
# `dump` is imported with the other libraries in the first cell of the notebook
dump(pipeline, filename="text_classification.joblib")

['text_classification.joblib']

In [23]:
# inspect the tweets whose sentiment label is 'worry'
data2.loc[data2['sentiment'] == 'worry']

Unnamed: 0,textID,sentiment,author,text,old_text,aux_id,new_sentiment,selected_text
5,1956968477,worry,xxxPEACHESxxx,Re-pinging : why didn`t you go to prom? BC my ...,Re-pinging @ghostridah14: why didn't you go to...,a610d6b25b,negative,didn`t like my
7,1956968636,worry,mcsleazy,Hmmm. http://www.djhero.com/ is down,Hmmm. http://www.djhero.com/ is down,2dfbe0b7fb,negative,
11,1956969531,worry,dudeitsmanda,Choked on her retainers,Choked on her retainers,133109505a,negative,Choked on her retainers
18,1956971473,worry,LCJ82,lady gaga tweeted about not being impressed b...,@PerezHilton lady gaga tweeted about not being...,23f0f2d1f3,negative,not being impressed
20,1956971981,worry,andreagauster,oh too bad! I hope it gets better. I`ve been ...,@raaaaaaek oh too bad! I hope it gets better. ...,ce2c823958,neutral,oh too bad! I hope it gets better. I`ve been h...
...,...,...,...,...,...,...,...,...
39936,1753903426,worry,carastjohn,tomorrow is going to be sooo awkward & embaras...,tomorrow is going to be sooo awkward &amp; emb...,09140327f6,neutral,
39938,1753903505,worry,primatage,"hey! negative on the primatech, this handle`s...","@icebergstorm hey! negative on the primatech, ...",7f7bd175fb,negative,negative
39941,1753903578,worry,somemandy,sure. But be careful also of making statement...,@PH7S sure. But be careful also of making stat...,a0b022f817,positive,But be careful also of making statements that ...
39956,1753903987,worry,nadszy,How Do You Sleep - Jesse McCartney,How Do You Sleep - Jesse McCartney,f63400d9fb,neutral,How Do You Sleep - Jesse McCartney
