In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE

#### Data Preperation

In [2]:
df=pd.read_csv('Tweets.csv')

In [3]:
df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [4]:
df.shape

(14640, 15)

In [5]:
df['airline_sentiment'].value_counts()

negative    9178
neutral     3099
positive    2363
Name: airline_sentiment, dtype: int64

In [6]:
df.isna().sum()

tweet_id                            0
airline_sentiment                   0
airline_sentiment_confidence        0
negativereason                   5462
negativereason_confidence        4118
airline                             0
airline_sentiment_gold          14600
name                                0
negativereason_gold             14608
retweet_count                       0
text                                0
tweet_coord                     13621
tweet_created                       0
tweet_location                   4733
user_timezone                    4820
dtype: int64

Making a new dataset, df1 with features as airline_sentiment and text from df

In [7]:
df1=df[['airline_sentiment','text']]

In [8]:
df1

Unnamed: 0,airline_sentiment,text
0,neutral,@VirginAmerica What @dhepburn said.
1,positive,@VirginAmerica plus you've added commercials t...
2,neutral,@VirginAmerica I didn't today... Must mean I n...
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...
...,...,...
14635,positive,@AmericanAir thank you we got on a different f...
14636,negative,@AmericanAir leaving over 20 minutes Late Flig...
14637,neutral,@AmericanAir Please bring American Airlines to...
14638,negative,"@AmericanAir you have my money, you change my ..."


In [9]:
df1.shape

(14640, 2)

In [10]:
df1.isna().sum()

airline_sentiment    0
text                 0
dtype: int64

Checking the target value counts

In [12]:
X=df1['text']
y=df1['airline_sentiment']

Checking the target value counts

In [13]:
 y.value_counts()

negative    9178
neutral     3099
positive    2363
Name: airline_sentiment, dtype: int64

It is evident that the dataset exhibits class imbalance, as the negative class has 9178 entries, the neutral class has 3099 entries, and the positive class has 2363 entries. Consequently, achieving precise predictions becomes challenging. Nevertheless, as our initial step, we will train classification ML models using the imbalanced dataset and assess their accuracy, precision, recall, and F1 score.

#### Text preprocessing 

In [14]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
stop_words=stopwords.words('english')
stemmer=PorterStemmer()

In [20]:

cleaned_data=[]
for i in range(len(X)):
    tweet=re.sub('[^a-zA-Z]',' ', X.iloc[i])
    tweet=tweet.lower().split()
    
    tweet=[stemmer.stem(word) for word in tweet if word not in stop_words]
    tweet=' '.join(tweet)
    cleaned_data.append(tweet)
    

In [21]:
cleaned_data

['virginamerica dhepburn said',
 'virginamerica plu ad commerci experi tacki',
 'virginamerica today must mean need take anoth trip',
 'virginamerica realli aggress blast obnoxi entertain guest face amp littl recours',
 'virginamerica realli big bad thing',
 'virginamerica serious would pay flight seat play realli bad thing fli va',
 'virginamerica ye nearli everi time fli vx ear worm go away',
 'virginamerica realli miss prime opportun men without hat parodi http co mwpg grezp',
 'virginamerica well',
 'virginamerica amaz arriv hour earli good',
 'virginamerica know suicid second lead caus death among teen',
 'virginamerica lt pretti graphic much better minim iconographi',
 'virginamerica great deal alreadi think nd trip australia amp even gone st trip yet p',
 'virginamerica virginmedia fli fabul seduct sky u take stress away travel http co ahlxhhkiyn',
 'virginamerica thank',
 'virginamerica sfo pdx schedul still mia',
 'virginamerica excit first cross countri flight lax mco heard n

In [62]:
len(cleaned_data)

14640

In [23]:
cv=CountVectorizer(max_features=3000, stop_words=['virginamerica '])
X=cv.fit_transform(cleaned_data).toarray()



Using lambda function converting sting variables into unique numbers

In [54]:
sentimen_ordering=['negative', 'neutral','positive']
y=y.apply(lambda x: sentimen_ordering.index(x))

In [25]:
y.value_counts()

0    9178
1    3099
2    2363
Name: airline_sentiment, dtype: int64

In [26]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.30)

In [27]:
X_train.shape, X_test.shape

((10248, 3000), (4392, 3000))

In [28]:
y_train.shape, y_test.shape

((10248,), (4392,))

##### Model 1 - Naive bayes

In [80]:
model1=MultinomialNB()
model1.fit(X_train,y_train)

MultinomialNB()

In [82]:
y_pred1=model1.predict(X_test)

In [84]:
y_pred1

array([0, 2, 0, ..., 0, 1, 0], dtype=int64)

In [86]:
print(classification_report(y_test, y_pred1))

              precision    recall  f1-score   support

           0       0.82      0.89      0.85      2742
           1       0.60      0.50      0.55       923
           2       0.72      0.66      0.69       727

    accuracy                           0.77      4392
   macro avg       0.71      0.68      0.70      4392
weighted avg       0.76      0.77      0.76      4392



##### model2 - Support Vector Machine

In [69]:
model2=SVC()

In [89]:
model2.fit(X_train, y_train)

SVC()

In [90]:
y_pred2=model2.predict(X_test)

In [91]:
print(classification_report(y_test, y_pred2))

              precision    recall  f1-score   support

           0       0.81      0.94      0.87      2742
           1       0.68      0.47      0.56       923
           2       0.81      0.62      0.70       727

    accuracy                           0.79      4392
   macro avg       0.77      0.68      0.71      4392
weighted avg       0.78      0.79      0.77      4392



##### Model3 - LogisticRegression

In [92]:
model3=LogisticRegression()
model3.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [93]:
y_pred3=model3.predict(X_test)

In [94]:
print(classification_report(y_test, y_pred3))

              precision    recall  f1-score   support

           0       0.84      0.89      0.86      2742
           1       0.62      0.55      0.58       923
           2       0.75      0.69      0.72       727

    accuracy                           0.78      4392
   macro avg       0.74      0.71      0.72      4392
weighted avg       0.78      0.78      0.78      4392



##### model4 - RandomForestClassifier

In [95]:
model4=RandomForestClassifier()
model4.fit(X_train, y_train)

RandomForestClassifier()

In [96]:
y_pred4=model4.predict(X_test)

In [97]:
print(classification_report(y_test, y_pred4))

              precision    recall  f1-score   support

           0       0.81      0.91      0.85      2742
           1       0.62      0.47      0.54       923
           2       0.77      0.63      0.69       727

    accuracy                           0.77      4392
   macro avg       0.73      0.67      0.69      4392
weighted avg       0.76      0.77      0.76      4392



#### Now balancing imbalanced dataset

##### Method 1 - UnderSampling

Undersampling is a technique used to address class imbalance in a dataset by reducing the number of instances in the majority class(es) to achieve a more balanced representation of the classes. This involves randomly selecting a subset of data points from the majority class(es) such that the resulting dataset has a balanced distribution of classes.

Before proceeding with undersampling, let's create a function called "clean_text_corpus" that will handle the task of cleaning the text. This function will perform several operations such as removing non-alphabetic characters, converting the text to lowercase, removing stop words, stemming the words, and finally joining the cleaned words into a single string, which will be saved in the "cleaned_data" list. By implementing this function, we can easily call it for every method, ensuring consistent and efficient text cleaning throughout the process.

In [46]:
def clean_text_corpus(text_corpus):
    cleaned_data = []
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words('english'))
    
    for tweet in text_corpus:
        # Remove non-alphabetic characters
        tweet = re.sub('[^a-zA-Z]', ' ', tweet)
        
        # Convert to lowercase and split into individual words
        tweet = tweet.lower().split()
        
        # Remove stop words and apply stemming
        tweet = [stemmer.stem(word) for word in tweet if word not in stop_words]
        
        # Join the cleaned words back into a single string
        tweet = ' '.join(tweet)
        
        cleaned_data.append(tweet)
        
    
    return cleaned_data


In [112]:
df1

Unnamed: 0,airline_sentiment,text
0,neutral,@VirginAmerica What @dhepburn said.
1,positive,@VirginAmerica plus you've added commercials t...
2,neutral,@VirginAmerica I didn't today... Must mean I n...
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...
...,...,...
14635,positive,@AmericanAir thank you we got on a different f...
14636,negative,@AmericanAir leaving over 20 minutes Late Flig...
14637,neutral,@AmericanAir Please bring American Airlines to...
14638,negative,"@AmericanAir you have my money, you change my ..."


In [120]:
df1['airline_sentiment']=df1['airline_sentiment'].map({'negative':0,'neutral':1,'positive':2}) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['airline_sentiment']=df1['airline_sentiment'].map({'negative':0,'neutral':1,'positive':2})


In [121]:
df1['airline_sentiment'].value_counts()

0    9178
1    3099
2    2363
Name: airline_sentiment, dtype: int64

In [122]:
df_negative=df1[df1['airline_sentiment']==0]
df_neutral=df1[df1['airline_sentiment']==1]
df_positive=df1[df1['airline_sentiment']==2]

In [123]:
df_negative.shape, df_neutral.shape, df_positive.shape

((9178, 2), (3099, 2), (2363, 2))

Making Undersamples of majority classes df_negative and df_neutral

In [157]:
df_negative=df_negative.sample(2363)
df_neutral=df_neutral.sample(2363)

In [159]:
df_negative.shape, df_neutral.shape

((2363, 2), (2363, 2))

In [160]:
df_unsample=pd.concat([df_negative,df_neutral,df_positive])

In [161]:
df_unsample.reset_index()

Unnamed: 0,index,airline_sentiment,text
0,5901,0,@SouthwestAir can't DM you without you followi...
1,4531,0,@SouthwestAir has become like every other airl...
2,14218,0,@AmericanAir flight from JFK to EGE was cxld a...
3,1402,0,@united 24 hrs since flight landed and ZERO in...
4,14100,0,"@AmericanAir @manuel_c ""here for you"" as in, y..."
...,...,...,...
7084,14623,2,@AmericanAir Love the new planes for the JFK-L...
7085,14625,2,@AmericanAir Flight 236 was great. Fantastic c...
7086,14628,2,Thank you. “@AmericanAir: @jlhalldc Customer R...
7087,14630,2,@AmericanAir Thanks! He is.


In [162]:
X=df_unsample['text']
y=df_unsample['airline_sentiment']

In [163]:
X.shape

(7089,)

In [164]:
clean_text_corpus(X)

['southwestair dm without follow',
 'southwestair becom like everi airlin crook believ take shouldwearmask shock',
 'americanair flight jfk ege cxld runway hour flight move give luggag back nice',
 'unit hr sinc flight land zero info miss bag rough eta would huge help restor confid',
 'americanair manuel c yeah tweet platitud think twitter bot care',
 'unit direct messag hear anyth back',
 'southwestair hold hour call got disconnect thank lot',
 'unit flight yyc provid free food allow back board broken lightbulb',
 'usairway need learn oper sit plane overnight gsp realiz mech problem delay flight hour',
 'usairway would kill let know mani minut might hold',
 'usairway understood simpli feel staff entir truth situat',
 'usairway honestli quickli possibl hold sine pm everyon answer send someon els',
 'jetblu respond friend lisap respond directli',
 'unit flight delay hour lost wallet money eat sleep yet repres help unitedsuck',
 'usairway brother brizzyberg miss flight travel home funer 

In [165]:
X.shape

(7089,)

In [166]:
X=cv.fit_transform(X).toarray()

In [167]:
X.shape

(7089, 3000)

In [168]:
y.value_counts()

0    2363
1    2363
2    2363
Name: airline_sentiment, dtype: int64

In [169]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25, random_state=15, stratify=y)

In [170]:
y_test.value_counts()

0    591
1    591
2    591
Name: airline_sentiment, dtype: int64

In [171]:
y_train.value_counts()

2    1772
1    1772
0    1772
Name: airline_sentiment, dtype: int64

Now data is undersampled for training, lte's start training each models and check classification report

##### MODEL 1

In [172]:
model1.fit(X_train,y_train)

MultinomialNB()

In [173]:
y_pred1=model1.predict(X_test)

In [174]:
print(classification_report(y_test, y_pred1))

              precision    recall  f1-score   support

           0       0.73      0.79      0.76       591
           1       0.73      0.64      0.68       591
           2       0.77      0.80      0.78       591

    accuracy                           0.74      1773
   macro avg       0.74      0.74      0.74      1773
weighted avg       0.74      0.74      0.74      1773



##### MODEL 2

In [175]:
model2.fit(X_train,y_train)
y_pred2=model2.predict(X_test)
print(classification_report(y_test, y_pred2))

              precision    recall  f1-score   support

           0       0.74      0.79      0.76       591
           1       0.69      0.75      0.72       591
           2       0.86      0.72      0.78       591

    accuracy                           0.75      1773
   macro avg       0.76      0.75      0.75      1773
weighted avg       0.76      0.75      0.75      1773



##### MODEL 3

In [176]:
model3.fit(X_train,y_train)
y_pred3=model3.predict(X_test)
print(classification_report(y_test, y_pred3))

              precision    recall  f1-score   support

           0       0.78      0.74      0.76       591
           1       0.70      0.76      0.73       591
           2       0.81      0.79      0.80       591

    accuracy                           0.76      1773
   macro avg       0.77      0.76      0.76      1773
weighted avg       0.77      0.76      0.76      1773



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


##### MODEL 4

In [177]:
model4.fit(X_train,y_train)
y_pred4=model4.predict(X_test)
print(classification_report(y_test, y_pred4))

              precision    recall  f1-score   support

           0       0.73      0.74      0.74       591
           1       0.66      0.74      0.70       591
           2       0.83      0.73      0.78       591

    accuracy                           0.73      1773
   macro avg       0.74      0.73      0.74      1773
weighted avg       0.74      0.73      0.74      1773



Method 2 - Oversampling

In [178]:
df_negative=df1[df1['airline_sentiment']==0]
df_neutral=df1[df1['airline_sentiment']==1]
df_positive=df1[df1['airline_sentiment']==2]

In [179]:
df_negative.shape, df_neutral.shape, df_positive.shape

((9178, 2), (3099, 2), (2363, 2))

In [180]:
df_neutral=df_neutral.sample(9178, replace=True)
df_positive=df_positive.sample(9178, replace=True)

In [181]:
df_negative.shape, df_neutral.shape, df_positive.shape

((9178, 2), (9178, 2), (9178, 2))

In [182]:
df_ovr_sample=pd.concat([df_negative,df_neutral,df_positive])

In [183]:
df_ovr_sample.shape

(27534, 2)

In [187]:
X=df_ovr_sample['text']
y=df_ovr_sample['airline_sentiment']

In [188]:
X.shape

(27534,)

In [189]:
clean_text_corpus(X)

['virginamerica realli aggress blast obnoxi entertain guest face amp littl recours',
 'virginamerica realli big bad thing',
 'virginamerica serious would pay flight seat play realli bad thing fli va',
 'virginamerica sfo pdx schedul still mia',
 'virginamerica flew nyc sfo last week fulli sit seat due two larg gentleman either side help',
 'virginamerica first fare may three time carrier seat avail select',
 'virginamerica guy mess seat reserv seat friend guy gave seat away want free internet',
 'virginamerica statu match program appli three week call email respons',
 'virginamerica happen ur vegan food option least say ur site know abl eat anyth next hr fail',
 'virginamerica amaz get cold air vent vx noair worstflightev roast sfotobo',
 'virginamerica hi bked cool birthday trip add elev caus enter middl name flight book problem',
 'virginamerica help left expens headphon flight iad lax today seat one answer l amp f number lax',
 'virginamerica await return phone call would prefer use

In [190]:
X.shape

(27534,)

In [191]:
X=cv.fit_transform(X).toarray()

In [193]:
X.shape

(27534, 3000)

In [194]:
y.value_counts()

0    9178
1    9178
2    9178
Name: airline_sentiment, dtype: int64

In [195]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25, random_state=15, stratify=y)

In [196]:
y_train.value_counts()

2    6884
0    6883
1    6883
Name: airline_sentiment, dtype: int64

In [197]:
y_test.value_counts()

0    2295
1    2295
2    2294
Name: airline_sentiment, dtype: int64

In [198]:
model1.fit(X_train,y_train)
y_pred1=model1.predict(X_test)
print(classification_report(y_test, y_pred1))

              precision    recall  f1-score   support

           0       0.77      0.83      0.80      2295
           1       0.77      0.69      0.73      2295
           2       0.83      0.86      0.84      2294

    accuracy                           0.79      6884
   macro avg       0.79      0.79      0.79      6884
weighted avg       0.79      0.79      0.79      6884



In [199]:
model2.fit(X_train,y_train)
y_pred2=model2.predict(X_test)
print(classification_report(y_test, y_pred2))

              precision    recall  f1-score   support

           0       0.93      0.87      0.90      2295
           1       0.86      0.92      0.89      2295
           2       0.94      0.94      0.94      2294

    accuracy                           0.91      6884
   macro avg       0.91      0.91      0.91      6884
weighted avg       0.91      0.91      0.91      6884



In [200]:
model3.fit(X_train,y_train)
y_pred3=model3.predict(X_test)
print(classification_report(y_test, y_pred3))

              precision    recall  f1-score   support

           0       0.89      0.80      0.84      2295
           1       0.82      0.86      0.84      2295
           2       0.89      0.93      0.91      2294

    accuracy                           0.86      6884
   macro avg       0.87      0.86      0.86      6884
weighted avg       0.87      0.86      0.86      6884



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [201]:
model4.fit(X_train,y_train)
y_pred4=model4.predict(X_test)
print(classification_report(y_test, y_pred4))

              precision    recall  f1-score   support

           0       0.95      0.89      0.92      2295
           1       0.91      0.95      0.93      2295
           2       0.96      0.98      0.97      2294

    accuracy                           0.94      6884
   macro avg       0.94      0.94      0.94      6884
weighted avg       0.94      0.94      0.94      6884



#### Method 3 - SMOTE

In [32]:
df1=df[['airline_sentiment','text']]
X=df1['text']
y=df1['airline_sentiment']

In [47]:
clean_text_corpus(X)

['virginamerica dhepburn said',
 'virginamerica plu ad commerci experi tacki',
 'virginamerica today must mean need take anoth trip',
 'virginamerica realli aggress blast obnoxi entertain guest face amp littl recours',
 'virginamerica realli big bad thing',
 'virginamerica serious would pay flight seat play realli bad thing fli va',
 'virginamerica ye nearli everi time fli vx ear worm go away',
 'virginamerica realli miss prime opportun men without hat parodi http co mwpg grezp',
 'virginamerica well',
 'virginamerica amaz arriv hour earli good',
 'virginamerica know suicid second lead caus death among teen',
 'virginamerica lt pretti graphic much better minim iconographi',
 'virginamerica great deal alreadi think nd trip australia amp even gone st trip yet p',
 'virginamerica virginmedia fli fabul seduct sky u take stress away travel http co ahlxhhkiyn',
 'virginamerica thank',
 'virginamerica sfo pdx schedul still mia',
 'virginamerica excit first cross countri flight lax mco heard n

In [48]:
X=cv.fit_transform(X).toarray()

In [33]:
X.shape, y.shape

((14640,), (14640,))

In [56]:
y

0        1
1        2
2        1
3        0
4        0
        ..
14635    2
14636    0
14637    1
14638    0
14639    1
Name: airline_sentiment, Length: 14640, dtype: int64

In [57]:
y.value_counts()

0    9178
1    3099
2    2363
Name: airline_sentiment, dtype: int64

In [62]:
smote = SMOTE(sampling_strategy={0: 9178, 1: 9178, 2: 9178})
X_sm, y_sm = smote.fit_resample(X, y)

In [77]:
X_sm.shape, y_sm.shape

((27534, 3000), (27534,))

In [66]:
y_sm.value_counts()

1    9178
2    9178
0    9178
Name: airline_sentiment, dtype: int64

In [79]:
X_train,X_test,y_train,y_test=train_test_split(X_sm,y_sm,test_size=0.25, random_state=15, stratify=y_sm)

MODEL 1

In [80]:
model1.fit(X_train,y_train)
y_pred1=model1.predict(X_test)
print(classification_report(y_test, y_pred1))

              precision    recall  f1-score   support

           0       0.82      0.87      0.85      2295
           1       0.73      0.53      0.61      2295
           2       0.68      0.83      0.75      2294

    accuracy                           0.74      6884
   macro avg       0.74      0.74      0.73      6884
weighted avg       0.74      0.74      0.73      6884



MODEL 2

In [81]:
model2.fit(X_train,y_train)
y_pred2=model2.predict(X_test)
print(classification_report(y_test, y_pred2))

              precision    recall  f1-score   support

           0       0.81      0.92      0.86      2295
           1       0.69      0.77      0.73      2295
           2       0.86      0.63      0.73      2294

    accuracy                           0.77      6884
   macro avg       0.78      0.77      0.77      6884
weighted avg       0.78      0.77      0.77      6884



MODEL 3

In [82]:
model3.fit(X_train,y_train)
y_pred3=model3.predict(X_test)
print(classification_report(y_test, y_pred3))

              precision    recall  f1-score   support

           0       0.87      0.85      0.86      2295
           1       0.67      0.80      0.73      2295
           2       0.81      0.68      0.74      2294

    accuracy                           0.77      6884
   macro avg       0.78      0.77      0.78      6884
weighted avg       0.78      0.77      0.78      6884



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


MODEL 4

In [83]:
model4.fit(X_train,y_train)
y_pred4=model4.predict(X_test)
print(classification_report(y_test, y_pred4))

              precision    recall  f1-score   support

           0       0.82      0.87      0.85      2295
           1       0.71      0.79      0.75      2295
           2       0.84      0.69      0.76      2294

    accuracy                           0.78      6884
   macro avg       0.79      0.78      0.78      6884
weighted avg       0.79      0.78      0.78      6884

