In [1]:
import pandas as pd
import numpy as np
import seaborn as sns


In [2]:
train_dataset =pd.read_csv("train.csv", encoding="ISO-8859-1")
train_dataset.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


## Explotary Data Analaysis (EDA)

### Preprocessing

In [3]:
## Check the information about the training set

In [4]:
train_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [5]:
## Check the duplicated entries of the training set

In [6]:
train_dataset.duplicated().sum()

0

In [7]:
## Get the value counts of keyword column

In [8]:
train_dataset['keyword'].value_counts()

keyword
fatalities               45
deluge                   42
armageddon               42
sinking                  41
damage                   41
                         ..
forest%20fire            19
epicentre                12
threat                   11
inundation               10
radiation%20emergency     9
Name: count, Length: 221, dtype: int64

In [9]:
## Get the value counts of location column

In [10]:
train_dataset['location'].value_counts()

location
USA                   104
New York               71
United States          50
London                 45
Canada                 29
                     ... 
Montr?al, Qu?bec        1
Montreal                1
?T: 6.4682,3.18287      1
Live4Heed??             1
Lincoln                 1
Name: count, Length: 3340, dtype: int64

In [11]:
train_dataset['location']

0       NaN
1       NaN
2       NaN
3       NaN
4       NaN
       ... 
7608    NaN
7609    NaN
7610    NaN
7611    NaN
7612    NaN
Name: location, Length: 7613, dtype: object

In [12]:
## check the null values of the keyword column

In [13]:
train_dataset['keyword'].isnull().sum()

61

In [14]:
## check the null values of the location column

In [15]:
train_dataset['location'].isnull().sum()

2533

In [16]:
## fill the nul values

In [17]:
 train_dataset['location'].fillna('Canada', inplace=True)
 train_dataset['keyword'].fillna('radiation%20emergency', inplace=True)

In [18]:
train_dataset.head()

Unnamed: 0,id,keyword,location,text,target
0,1,radiation%20emergency,Canada,Our Deeds are the Reason of this #earthquake M...,1
1,4,radiation%20emergency,Canada,Forest fire near La Ronge Sask. Canada,1
2,5,radiation%20emergency,Canada,All residents asked to 'shelter in place' are ...,1
3,6,radiation%20emergency,Canada,"13,000 people receive #wildfires evacuation or...",1
4,7,radiation%20emergency,Canada,Just got sent this photo from Ruby #Alaska as ...,1


In [19]:
train_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7613 non-null   object
 2   location  7613 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


#### Text Preprcoessing

In [20]:
import re
import string

In [21]:
train_dataset['text'].head()

0    Our Deeds are the Reason of this #earthquake M...
1               Forest fire near La Ronge Sask. Canada
2    All residents asked to 'shelter in place' are ...
3    13,000 people receive #wildfires evacuation or...
4    Just got sent this photo from Ruby #Alaska as ...
Name: text, dtype: object

In [22]:
train_dataset['keyword'].head()

0    radiation%20emergency
1    radiation%20emergency
2    radiation%20emergency
3    radiation%20emergency
4    radiation%20emergency
Name: keyword, dtype: object

In [23]:
train_dataset['location'].head()

0    Canada
1    Canada
2    Canada
3    Canada
4    Canada
Name: location, dtype: object

In [24]:
## convert the uppercase letters into the lowercase letters

In [25]:
train_dataset['text'] = train_dataset['text'].apply(lambda x:" ".join(x.lower() for x in x.split()))
train_dataset['text']

0       our deeds are the reason of this #earthquake m...
1                  forest fire near la ronge sask. canada
2       all residents asked to 'shelter in place' are ...
3       13,000 people receive #wildfires evacuation or...
4       just got sent this photo from ruby #alaska as ...
                              ...                        
7608    two giant cranes holding a bridge collapse int...
7609    @aria_ahrary @thetawniest the out of control w...
7610    m1.94 [01:04 utc]?5km s of volcano hawaii. htt...
7611    police investigating after an e-bike collided ...
7612    the latest: more homes razed by northern calif...
Name: text, Length: 7613, dtype: object

In [26]:
train_dataset['location']=train_dataset['location'].apply(lambda x:" ".join(x.lower() for x in x.split()))
train_dataset['location'].iloc[100:200]

100                         uk
101             nairobi, kenya
102    instagram - @heyimginog
103                        304
104                switzerland
                ...           
195        || c h i c a g o ||
196                     canada
197                      l. a.
198                     canada
199                     canada
Name: location, Length: 100, dtype: object

In [27]:
## Remove the links

In [28]:
train_dataset['text']=train_dataset['text'].apply(lambda x: " ".join(re.sub(r'^https?:\/\/.*[\r\n]*','',x,flags=re.MULTILINE) for x in x.split()))
train_dataset['text'].head()

0    our deeds are the reason of this #earthquake m...
1               forest fire near la ronge sask. canada
2    all residents asked to 'shelter in place' are ...
3    13,000 people receive #wildfires evacuation or...
4    just got sent this photo from ruby #alaska as ...
Name: text, dtype: object

In [29]:
## Remove Punctuations

In [30]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [31]:
def remove_puncutations_of_text(text):
    for punctuation in string.punctuation:
        text=text.replace(punctuation, '')
    return text
train_dataset['text']=train_dataset['text'].apply(remove_puncutations_of_text)

In [32]:
train_dataset['text'].head()

0    our deeds are the reason of this earthquake ma...
1                forest fire near la ronge sask canada
2    all residents asked to shelter in place are be...
3    13000 people receive wildfires evacuation orde...
4    just got sent this photo from ruby alaska as s...
Name: text, dtype: object

In [33]:
def remove_punctuations_of_location(text):
    for punctuations in string.punctuation:
        location=text.replace(punctuations,'')
    return location
train_dataset['location']=train_dataset['location'].apply(remove_punctuations_of_location)

In [34]:
train_dataset['location'].head()

0    canada
1    canada
2    canada
3    canada
4    canada
Name: location, dtype: object

In [35]:
def remove_punctuations_of_keyword(text):
    for punctuations in string.punctuation:
        keyword=text.replace(punctuations,'')
    return keyword
train_dataset['keyword']=train_dataset['keyword'].apply(remove_punctuations_of_keyword)

In [36]:
train_dataset['keyword'].head()

0    radiation%20emergency
1    radiation%20emergency
2    radiation%20emergency
3    radiation%20emergency
4    radiation%20emergency
Name: keyword, dtype: object

In [37]:
## remove the numbers

In [38]:
train_dataset['text']=train_dataset['text'].str.replace('\d+','',regex=True)

  train_dataset['text']=train_dataset['text'].str.replace('\d+','',regex=True)


In [39]:
train_dataset['text'].head()

0    our deeds are the reason of this earthquake ma...
1                forest fire near la ronge sask canada
2    all residents asked to shelter in place are be...
3     people receive wildfires evacuation orders in...
4    just got sent this photo from ruby alaska as s...
Name: text, dtype: object

In [40]:
train_dataset['keyword']=train_dataset['keyword'].str.replace('\d+','',regex=True)


  train_dataset['keyword']=train_dataset['keyword'].str.replace('\d+','',regex=True)


In [41]:
train_dataset['keyword'].head()

0    radiation%emergency
1    radiation%emergency
2    radiation%emergency
3    radiation%emergency
4    radiation%emergency
Name: keyword, dtype: object

In [42]:
## Remove the stop words

In [43]:
import nltk

In [44]:
nltk.download('stopwords', download_dir="static/model") 


[nltk_data] Downloading package stopwords to static/model...
[nltk_data]   Package stopwords is already up-to-date!


True

In [45]:
with open('./static/model/corpora/stopwords/english', 'r') as file:
    sw =file.read().splitlines()
    

In [46]:
sw

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [47]:
train_dataset['text']=train_dataset['text'].apply(lambda x:" ".join(x for x in x.split() if x not in sw))
train_dataset['text'].head()

0         deeds reason earthquake may allah forgive us
1                forest fire near la ronge sask canada
2    residents asked shelter place notified officer...
3    people receive wildfires evacuation orders cal...
4    got sent photo ruby alaska smoke wildfires pou...
Name: text, dtype: object

In [48]:
train_dataset['keyword']=train_dataset['keyword'].apply(lambda x:" ".join(x for x in x.split() if x not in sw))
train_dataset['keyword'].head()

0    radiation%emergency
1    radiation%emergency
2    radiation%emergency
3    radiation%emergency
4    radiation%emergency
Name: keyword, dtype: object

In [49]:
## Stemming technique

In [50]:
from nltk.stem import PorterStemmer
ps=PorterStemmer()

In [51]:
train_dataset['text']=train_dataset['text'].apply(lambda x:" ".join(ps.stem(x) for x in x.split()))
train_dataset['text'].head()

0            deed reason earthquak may allah forgiv us
1                 forest fire near la rong sask canada
2    resid ask shelter place notifi offic evacu she...
3          peopl receiv wildfir evacu order california
4    got sent photo rubi alaska smoke wildfir pour ...
Name: text, dtype: object

In [52]:
train_dataset['keyword']=train_dataset['keyword'].apply(lambda x:" ".join(ps.stem(x) for x in x.split()))
train_dataset['keyword'].head()

0    radiation%emerg
1    radiation%emerg
2    radiation%emerg
3    radiation%emerg
4    radiation%emerg
Name: keyword, dtype: object

In [53]:
## print the training dataset

In [54]:
train_dataset.head()

Unnamed: 0,id,keyword,location,text,target
0,1,radiation%emerg,canada,deed reason earthquak may allah forgiv us,1
1,4,radiation%emerg,canada,forest fire near la rong sask canada,1
2,5,radiation%emerg,canada,resid ask shelter place notifi offic evacu she...,1
3,6,radiation%emerg,canada,peopl receiv wildfir evacu order california,1
4,7,radiation%emerg,canada,got sent photo rubi alaska smoke wildfir pour ...,1


In [55]:
## Building the vocabulary

In [56]:
from collections import Counter
vocab=Counter()

In [57]:
vocab

Counter()

In [58]:
for sentence in train_dataset['text']:
    vocab.update(sentence.split())

In [59]:
len(vocab)

13733

In [60]:
for words in train_dataset['keyword']:
    vocab.update(words.split())

In [61]:
len(vocab)

13764

In [62]:
len(vocab)

13764

In [63]:
tokens = [key for key in vocab if vocab[key]>10 and vocab[key]<75]

In [64]:
len(tokens)

1146

In [65]:
## save the vocabulary

In [66]:
def save_vocabulary(lines, file_path):
    data = '\n'.join(lines)
    file=open(file_path, 'w', encoding="utf-8")
    file.write(data)
    file.close()
save_vocabulary(tokens, "./static/model/vocabulary.txt")

In [67]:
## Divide the dataset 

In [68]:
x=train_dataset.drop('target', axis=1)

In [69]:
x=train_dataset.drop({'id','target'},axis=1)

In [70]:
x

Unnamed: 0,keyword,location,text
0,radiation%emerg,canada,deed reason earthquak may allah forgiv us
1,radiation%emerg,canada,forest fire near la rong sask canada
2,radiation%emerg,canada,resid ask shelter place notifi offic evacu she...
3,radiation%emerg,canada,peopl receiv wildfir evacu order california
4,radiation%emerg,canada,got sent photo rubi alaska smoke wildfir pour ...
...,...,...,...
7608,radiation%emerg,canada,two giant crane hold bridg collaps nearbi home
7609,radiation%emerg,canada,ariaahrari thetawniest control wild fire calif...
7610,radiation%emerg,canada,utckm volcano hawaii
7611,radiation%emerg,canada,polic investig ebik collid car littl portug eb...


In [71]:
y=train_dataset['target']

In [72]:
y

0       1
1       1
2       1
3       1
4       1
       ..
7608    1
7609    1
7610    1
7611    1
7612    1
Name: target, Length: 7613, dtype: int64

In [73]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2, random_state=42)

In [74]:
x_train.shape

(6090, 3)

In [75]:
y_train.shape

(6090,)

In [76]:
x_test.shape

(1523, 3)

In [136]:
x_test.head()

Unnamed: 0,keyword,location,text
2644,destruct,canada,new weapon caus unimagin destruct
2227,delug,canada,famp thing gishwh got soak delug go pad tampon...
5448,polic,uk,dt georgegalloway rt gallowaymayor col polic c...
132,aftershock,canada,aftershock back school kick great want thank e...
6845,trauma,"montgomery county, md",respons trauma children addict develop defens ...


In [129]:
## combined the x_train columns text, location, Keyword columns

In [131]:
combined_pd_x_train = x_train['keyword']+" "+x_train['location']+" "+x_train['text']
combined_pd_x_train

4996    militari texas courag honest analysi need use ...
3263    engulf canada zachzaidman thescor wld b shame ...
4907    massacr cottonwood arizona tell barackobama re...
2855    drought spokane, wa worri ca drought might aff...
4716    lava medan,indonesia youngheroesid lava blast ...
                              ...                        
5226    obliter merica! egan arent mani obliter server...
5390    panic canada panic attack bc dont enough money...
860     blood canada omron hemc automat blood pressur ...
7603    radiation%emerg canada offici say quarantin pl...
7270    whirlwind stamford & cork (& shropshire) move ...
Length: 6090, dtype: object

In [130]:
## combined the x_test columns text, location, keywords columns

In [138]:
combined_pd_x_test=x_test['keyword']+" "+x_test['location']+" "+x_test['text']
combined_pd_x_test

2644    destruct canada new weapon caus unimagin destruct
2227    delug canada famp thing gishwh got soak delug ...
5448    polic uk dt georgegalloway rt gallowaymayor co...
132     aftershock canada aftershock back school kick ...
6845    trauma montgomery county, md respons trauma ch...
                              ...                        
1835             crash somewhere smusx skype crash u host
506     attack arundel christian attack muslim templ m...
3592    fatal new south wales, australia man charg fat...
6740    thunderstorm canada usnwsgov sever weather sta...
1634    collaps canada great british ltbgtbakeltbgt of...
Length: 1523, dtype: object

In [77]:
## Conver the text into  numerical values

In [78]:
import numpy as np

def vectorizer(ds, vocabulary):
    vectorizer_list = []
    vocab_set=set(vocabulary)
    for sentence in ds:
        sentence_list = np.zeros(len(vocabulary), dtype=np.float16)  # Create a zero array for each sentence
        sentence_words = set(sentence.split())  # Convert the sentence into a set of words
        
        for i, word in enumerate(vocabulary):  # Iterate over the vocabulary (list, ordered)
            if word in sentence_words:  # Check if the word exists in the sentence
                sentence_list[i] = 1  # Set the corresponding index to 1
                
        vectorizer_list.append(sentence_list)
    
    # Convert the list of arrays to a numpy array
    vectorizer_list_new = np.asarray(vectorizer_list, dtype=np.float16)
    
    return vectorizer_list_new


In [134]:
vectorizer_x_train= vectorizer(combined_pd_x_train, tokens)
vectorizer_x_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float16)

In [139]:
vectorizer_x_test=vectorizer(combined_pd_x_test,tokens)
vectorizer_x_test

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float16)

In [85]:
## Check the vlue of counts of y train

In [86]:
y_train.value_counts()

target
0    3468
1    2622
Name: count, dtype: int64

In [87]:
vectorizer_x_train.shape

(6090, 1146)

In [88]:
y_train.shape

(6090,)

In [89]:
vectorizer_x_test.shape

(1523, 1146)

In [90]:
vectorizer_x_train_pd = pd.DataFrame(vectorizer_x_train)
vectorizer_x_train_pd

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1136,1137,1138,1139,1140,1141,1142,1143,1144,1145
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6085,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6086,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6087,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6088,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [91]:
## fix the imbalance data issue

In [92]:
y_train.value_counts()

target
0    3468
1    2622
Name: count, dtype: int64

In [93]:
from imblearn.over_sampling import SMOTE

smote=SMOTE()
vectorized_x_train_smote , y_train_smote = smote.fit_resample(vectorizer_x_train,y_train)
print(vectorized_x_train_smote.shape, y_train_smote.shape)

(6936, 1146) (6936,)


##  Model Selectiona And Model Training

In [94]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

In [95]:
## logistci regression model

In [96]:
lr=LogisticRegression()
lr.fit(vectorized_x_train_smote,y_train_smote)

In [97]:
lr_predictions = lr.predict(vectorizer_x_test)
lr_predictions[1:10]

array([0, 0, 0, 0, 1, 1, 0, 1, 0], dtype=int64)

In [98]:
accuracy_score(y_test,lr_predictions)

0.6296782665791202

In [99]:
print(classification_report(y_test,lr_predictions))

              precision    recall  f1-score   support

           0       0.62      0.89      0.73       874
           1       0.66      0.27      0.39       649

    accuracy                           0.63      1523
   macro avg       0.64      0.58      0.56      1523
weighted avg       0.64      0.63      0.59      1523



In [100]:
svc=SVC()
svc.fit(vectorized_x_train_smote,y_train_smote)

In [101]:
## making the predictions based on the test dataset

In [102]:
svc_predictions =svc.predict(vectorizer_x_test)
svc_predictions[1:100]

array([0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1], dtype=int64)

In [103]:
y_test

2644    1
2227    0
5448    1
132     0
6845    0
       ..
1835    0
506     1
3592    1
6740    1
1634    0
Name: target, Length: 1523, dtype: int64

In [104]:
accuracy_score(svc_predictions, y_test)

0.6263952724885096

In [105]:
print(classification_report(svc_predictions, y_test))

              precision    recall  f1-score   support

           0       0.89      0.62      0.73      1247
           1       0.27      0.64      0.38       276

    accuracy                           0.63      1523
   macro avg       0.58      0.63      0.56      1523
weighted avg       0.78      0.63      0.67      1523



In [106]:
rf_clf =RandomForestClassifier()
rf_clf.fit(vectorized_x_train_smote,y_train_smote)

In [107]:
## Make the predictions based on test set

In [108]:
rf_clf_predictions= rf_clf.predict(vectorizer_x_test)
rf_clf_predictions[10:100]

array([0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1], dtype=int64)

In [109]:
accuracy_score(y_test,rf_clf_predictions)

0.6296782665791202

In [110]:
dt_clf=DecisionTreeClassifier()
dt_clf.fit(vectorizer_x_train,y_train)

In [111]:
## Make the predictions based on test sety

In [112]:
dt_clf_predictions =dt_clf.predict(vectorizer_x_test)
dt_clf_predictions[10:100]

array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0], dtype=int64)

In [113]:
accuracy_score(y_test,dt_clf_predictions)

0.6349310571240971

In [114]:
from sklearn.model_selection import GridSearchCV

In [115]:
param_grid_lr ={"penalty":['l1', 'l2','elasticnet', 'None'],"random_state":[42,82,102,123],"C":[1.0,2.0,3.0],"fit_intercept":[True,False],"solver":['lbfgs', 'liblinear', 'newton-cg','newton-cholesky', 'sag', 'saga']}

In [116]:
param_grid_lr

{'penalty': ['l1', 'l2', 'elasticnet', 'None'],
 'random_state': [42, 82, 102, 123],
 'C': [1.0, 2.0, 3.0],
 'fit_intercept': [True, False],
 'solver': ['lbfgs',
  'liblinear',
  'newton-cg',
  'newton-cholesky',
  'sag',
  'saga']}

In [117]:
lr_grid_search_cv = GridSearchCV(estimator=lr,param_grid=param_grid_lr)
lr_grid_search_cv.fit(vectorized_x_train_smote,y_train_smote)

1920 fits failed out of a total of 2880.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
120 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\THIS PC\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\THIS PC\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\THIS PC\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1169, in fit
    solver = _check

#### best parameters of logistic regression

In [118]:
lr_grid_search_cv.best_params_

{'C': 2.0,
 'fit_intercept': False,
 'penalty': 'l1',
 'random_state': 123,
 'solver': 'saga'}

In [119]:
lr_grid_search_cv_predictions = lr.predict(vectorizer_x_test)
lr_grid_search_cv_predictions

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [120]:
accuracy_score(y_test, lr_grid_search_cv_predictions)

0.6296782665791202

In [121]:
print(classification_report(y_test,lr_grid_search_cv_predictions))

              precision    recall  f1-score   support

           0       0.62      0.89      0.73       874
           1       0.66      0.27      0.39       649

    accuracy                           0.63      1523
   macro avg       0.64      0.58      0.56      1523
weighted avg       0.64      0.63      0.59      1523



In [122]:
svc_param_grid = {"C":[1.0,2.0,3.0], "kernel":['linear','sqaure','sigmoid', 'precomputed']}

In [123]:
svc_param_grid

{'C': [1.0, 2.0, 3.0],
 'kernel': ['linear', 'sqaure', 'sigmoid', 'precomputed']}

In [124]:
svc_grid_search_cv =GridSearchCV(estimator=svc,param_grid=svc_param_grid)
svc_grid_search_cv.fit(vectorized_x_train_smote,y_train_smote)

ValueError: X should be a square kernel matrix