In [1]:
import pandas as pd
import numpy as np
import seaborn as sns


In [2]:
train_dataset =pd.read_csv("train.csv", encoding="ISO-8859-1")
train_dataset.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


## Explotary Data Analaysis (EDA)

### Preprocessing

In [3]:
## Check the information about the training set

In [4]:
train_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [5]:
## Check the duplicated entries of the training set

In [6]:
train_dataset.duplicated().sum()

0

In [7]:
## Get the value counts of keyword column

In [8]:
train_dataset['keyword'].value_counts()

keyword
fatalities               45
deluge                   42
armageddon               42
sinking                  41
damage                   41
                         ..
forest%20fire            19
epicentre                12
threat                   11
inundation               10
radiation%20emergency     9
Name: count, Length: 221, dtype: int64

In [9]:
## Get the value counts of location column

In [10]:
train_dataset['location'].value_counts()

location
USA                   104
New York               71
United States          50
London                 45
Canada                 29
                     ... 
Montr?al, Qu?bec        1
Montreal                1
?T: 6.4682,3.18287      1
Live4Heed??             1
Lincoln                 1
Name: count, Length: 3340, dtype: int64

In [11]:
train_dataset['location']

0       NaN
1       NaN
2       NaN
3       NaN
4       NaN
       ... 
7608    NaN
7609    NaN
7610    NaN
7611    NaN
7612    NaN
Name: location, Length: 7613, dtype: object

In [12]:
## check the null values of the keyword column

In [13]:
train_dataset['keyword'].isnull().sum()

61

In [14]:
## check the null values of the location column

In [15]:
train_dataset['location'].isnull().sum()

2533

In [16]:
## fill the nul values

In [17]:
 train_dataset['location'].fillna('Canada', inplace=True)
 train_dataset['keyword'].fillna('radiation%20emergency', inplace=True)

In [18]:
train_dataset.head()

Unnamed: 0,id,keyword,location,text,target
0,1,radiation%20emergency,Canada,Our Deeds are the Reason of this #earthquake M...,1
1,4,radiation%20emergency,Canada,Forest fire near La Ronge Sask. Canada,1
2,5,radiation%20emergency,Canada,All residents asked to 'shelter in place' are ...,1
3,6,radiation%20emergency,Canada,"13,000 people receive #wildfires evacuation or...",1
4,7,radiation%20emergency,Canada,Just got sent this photo from Ruby #Alaska as ...,1


In [19]:
train_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7613 non-null   object
 2   location  7613 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


#### Text Preprcoessing

In [20]:
import re
import string

In [21]:
train_dataset['text'].head()

0    Our Deeds are the Reason of this #earthquake M...
1               Forest fire near La Ronge Sask. Canada
2    All residents asked to 'shelter in place' are ...
3    13,000 people receive #wildfires evacuation or...
4    Just got sent this photo from Ruby #Alaska as ...
Name: text, dtype: object

In [22]:
train_dataset['keyword'].head()

0    radiation%20emergency
1    radiation%20emergency
2    radiation%20emergency
3    radiation%20emergency
4    radiation%20emergency
Name: keyword, dtype: object

In [23]:
train_dataset['location'].head()

0    Canada
1    Canada
2    Canada
3    Canada
4    Canada
Name: location, dtype: object

In [24]:
## convert the uppercase letters into the lowercase letters

In [25]:
train_dataset['text'] = train_dataset['text'].apply(lambda x:" ".join(x.lower() for x in x.split()))
train_dataset['text']

0       our deeds are the reason of this #earthquake m...
1                  forest fire near la ronge sask. canada
2       all residents asked to 'shelter in place' are ...
3       13,000 people receive #wildfires evacuation or...
4       just got sent this photo from ruby #alaska as ...
                              ...                        
7608    two giant cranes holding a bridge collapse int...
7609    @aria_ahrary @thetawniest the out of control w...
7610    m1.94 [01:04 utc]?5km s of volcano hawaii. htt...
7611    police investigating after an e-bike collided ...
7612    the latest: more homes razed by northern calif...
Name: text, Length: 7613, dtype: object

In [26]:
train_dataset['location']=train_dataset['location'].apply(lambda x:" ".join(x.lower() for x in x.split()))
train_dataset['location'].iloc[100:200]

100                         uk
101             nairobi, kenya
102    instagram - @heyimginog
103                        304
104                switzerland
                ...           
195        || c h i c a g o ||
196                     canada
197                      l. a.
198                     canada
199                     canada
Name: location, Length: 100, dtype: object

In [27]:
## Remove the links

In [28]:
train_dataset['text']=train_dataset['text'].apply(lambda x: " ".join(re.sub(r'^https?:\/\/.*[\r\n]*','',x,flags=re.MULTILINE) for x in x.split()))
train_dataset['text'].head()

0    our deeds are the reason of this #earthquake m...
1               forest fire near la ronge sask. canada
2    all residents asked to 'shelter in place' are ...
3    13,000 people receive #wildfires evacuation or...
4    just got sent this photo from ruby #alaska as ...
Name: text, dtype: object

In [29]:
## Remove Punctuations

In [30]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [31]:
def remove_puncutations_of_text(text):
    for punctuation in string.punctuation:
        text=text.replace(punctuation, '')
    return text
train_dataset['text']=train_dataset['text'].apply(remove_puncutations_of_text)

In [32]:
train_dataset['text'].head()

0    our deeds are the reason of this earthquake ma...
1                forest fire near la ronge sask canada
2    all residents asked to shelter in place are be...
3    13000 people receive wildfires evacuation orde...
4    just got sent this photo from ruby alaska as s...
Name: text, dtype: object

In [33]:
def remove_punctuations_of_location(text):
    for punctuations in string.punctuation:
        location=text.replace(punctuations,'')
    return location
train_dataset['location']=train_dataset['location'].apply(remove_punctuations_of_location)

In [34]:
train_dataset['location'].head()

0    canada
1    canada
2    canada
3    canada
4    canada
Name: location, dtype: object

In [35]:
def remove_punctuations_of_keyword(text):
    for punctuations in string.punctuation:
        keyword=text.replace(punctuations,'')
    return keyword
train_dataset['keyword']=train_dataset['keyword'].apply(remove_punctuations_of_keyword)

In [36]:
train_dataset['keyword'].head()

0    radiation%20emergency
1    radiation%20emergency
2    radiation%20emergency
3    radiation%20emergency
4    radiation%20emergency
Name: keyword, dtype: object

In [37]:
## remove the numbers

In [38]:
train_dataset['text']=train_dataset['text'].str.replace('\d+','',regex=True)

  train_dataset['text']=train_dataset['text'].str.replace('\d+','',regex=True)


In [39]:
train_dataset['text'].head()

0    our deeds are the reason of this earthquake ma...
1                forest fire near la ronge sask canada
2    all residents asked to shelter in place are be...
3     people receive wildfires evacuation orders in...
4    just got sent this photo from ruby alaska as s...
Name: text, dtype: object

In [40]:
train_dataset['keyword']=train_dataset['keyword'].str.replace('\d+','',regex=True)


  train_dataset['keyword']=train_dataset['keyword'].str.replace('\d+','',regex=True)


In [41]:
train_dataset['keyword'].head()

0    radiation%emergency
1    radiation%emergency
2    radiation%emergency
3    radiation%emergency
4    radiation%emergency
Name: keyword, dtype: object

In [42]:
## Remove the stop words

In [43]:
import nltk

In [44]:
nltk.download('stopwords', download_dir="static/model") 


[nltk_data] Downloading package stopwords to static/model...
[nltk_data]   Package stopwords is already up-to-date!


True

In [45]:
with open('./static/model/corpora/stopwords/english', 'r') as file:
    sw =file.read().splitlines()
    

In [46]:
sw

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [47]:
train_dataset['text']=train_dataset['text'].apply(lambda x:" ".join(x for x in x.split() if x not in sw))
train_dataset['text'].head()

0         deeds reason earthquake may allah forgive us
1                forest fire near la ronge sask canada
2    residents asked shelter place notified officer...
3    people receive wildfires evacuation orders cal...
4    got sent photo ruby alaska smoke wildfires pou...
Name: text, dtype: object

In [48]:
train_dataset['keyword']=train_dataset['keyword'].apply(lambda x:" ".join(x for x in x.split() if x not in sw))
train_dataset['keyword'].head()

0    radiation%emergency
1    radiation%emergency
2    radiation%emergency
3    radiation%emergency
4    radiation%emergency
Name: keyword, dtype: object

In [49]:
## Stemming technique

In [50]:
from nltk.stem import PorterStemmer
ps=PorterStemmer()

In [51]:
train_dataset['text']=train_dataset['text'].apply(lambda x:" ".join(ps.stem(x) for x in x.split()))
train_dataset['text'].head()

0            deed reason earthquak may allah forgiv us
1                 forest fire near la rong sask canada
2    resid ask shelter place notifi offic evacu she...
3          peopl receiv wildfir evacu order california
4    got sent photo rubi alaska smoke wildfir pour ...
Name: text, dtype: object

In [52]:
train_dataset['keyword']=train_dataset['keyword'].apply(lambda x:" ".join(ps.stem(x) for x in x.split()))
train_dataset['keyword'].head()

0    radiation%emerg
1    radiation%emerg
2    radiation%emerg
3    radiation%emerg
4    radiation%emerg
Name: keyword, dtype: object

In [53]:
## print the training dataset

In [54]:
train_dataset.head()

Unnamed: 0,id,keyword,location,text,target
0,1,radiation%emerg,canada,deed reason earthquak may allah forgiv us,1
1,4,radiation%emerg,canada,forest fire near la rong sask canada,1
2,5,radiation%emerg,canada,resid ask shelter place notifi offic evacu she...,1
3,6,radiation%emerg,canada,peopl receiv wildfir evacu order california,1
4,7,radiation%emerg,canada,got sent photo rubi alaska smoke wildfir pour ...,1


In [55]:
## Building the vocabulary

In [56]:
from collections import Counter
vocab_train=Counter()

In [57]:
vocab_train

Counter()

In [58]:
for sentence in train_dataset['text']:
    vocab_train.update(sentence.split())

In [59]:
len(vocab_train)

13733

In [60]:
for words in train_dataset['keyword']:
    vocab_train.update(words.split())

In [61]:
len(vocab_train)

13764

In [62]:
len(vocab_train)

13764

In [63]:
#tokens = [key for key in vocab if vocab[key]>10 and vocab[key]<75]

In [64]:
#len(tokens)

In [65]:
## save the vocabulary

In [66]:
def save_vocabulary(lines, file_path):
    data = '\n'.join(lines)
    file=open(file_path, 'w', encoding="utf-8")
    file.write(data)
    file.close()
save_vocabulary(vocab_train, "./static/model/vocabulary.txt")

In [67]:
## Divide the dataset 

In [68]:
x=train_dataset.drop('target', axis=1)

In [69]:
x=train_dataset.drop({'id','target'},axis=1)

In [70]:
x

Unnamed: 0,keyword,location,text
0,radiation%emerg,canada,deed reason earthquak may allah forgiv us
1,radiation%emerg,canada,forest fire near la rong sask canada
2,radiation%emerg,canada,resid ask shelter place notifi offic evacu she...
3,radiation%emerg,canada,peopl receiv wildfir evacu order california
4,radiation%emerg,canada,got sent photo rubi alaska smoke wildfir pour ...
...,...,...,...
7608,radiation%emerg,canada,two giant crane hold bridg collaps nearbi home
7609,radiation%emerg,canada,ariaahrari thetawniest control wild fire calif...
7610,radiation%emerg,canada,utckm volcano hawaii
7611,radiation%emerg,canada,polic investig ebik collid car littl portug eb...


In [71]:
y=train_dataset['target']

In [72]:
y

0       1
1       1
2       1
3       1
4       1
       ..
7608    1
7609    1
7610    1
7611    1
7612    1
Name: target, Length: 7613, dtype: int64

In [73]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2, random_state=42)

In [74]:
x_train.shape

(6090, 3)

In [75]:
y_train.shape

(6090,)

In [76]:
x_test.shape

(1523, 3)

In [77]:
x_test.head()

Unnamed: 0,keyword,location,text
2644,destruct,canada,new weapon caus unimagin destruct
2227,delug,canada,famp thing gishwh got soak delug go pad tampon...
5448,polic,uk,dt georgegalloway rt gallowaymayor col polic c...
132,aftershock,canada,aftershock back school kick great want thank e...
6845,trauma,"montgomery county, md",respons trauma children addict develop defens ...


In [78]:
## combined the x_train columns text, location, Keyword columns

In [79]:
combined_pd_x_train = x_train['keyword']+" "+x_train['location']+" "+x_train['text']
combined_pd_x_train

4996    militari texas courag honest analysi need use ...
3263    engulf canada zachzaidman thescor wld b shame ...
4907    massacr cottonwood arizona tell barackobama re...
2855    drought spokane, wa worri ca drought might aff...
4716    lava medan,indonesia youngheroesid lava blast ...
                              ...                        
5226    obliter merica! egan arent mani obliter server...
5390    panic canada panic attack bc dont enough money...
860     blood canada omron hemc automat blood pressur ...
7603    radiation%emerg canada offici say quarantin pl...
7270    whirlwind stamford & cork (& shropshire) move ...
Length: 6090, dtype: object

In [80]:
## combined the x_test columns text, location, keywords columns

In [81]:
combined_pd_x_test=x_test['keyword']+" "+x_test['location']+" "+x_test['text']
combined_pd_x_test

2644    destruct canada new weapon caus unimagin destruct
2227    delug canada famp thing gishwh got soak delug ...
5448    polic uk dt georgegalloway rt gallowaymayor co...
132     aftershock canada aftershock back school kick ...
6845    trauma montgomery county, md respons trauma ch...
                              ...                        
1835             crash somewhere smusx skype crash u host
506     attack arundel christian attack muslim templ m...
3592    fatal new south wales, australia man charg fat...
6740    thunderstorm canada usnwsgov sever weather sta...
1634    collaps canada great british ltbgtbakeltbgt of...
Length: 1523, dtype: object

In [82]:
## Conver the text into  numerical values

In [83]:
import numpy as np

def vectorizer(ds, vocabulary):
    vectorizer_list = []
    vocab_set=set(vocabulary)
    for sentence in ds:
        sentence_list = np.zeros(len(vocabulary), dtype=np.float16)  # Create a zero array for each sentence
        sentence_words = set(sentence.split())  # Convert the sentence into a set of words
        
        for i, word in enumerate(vocabulary):  # Iterate over the vocabulary (list, ordered)
            if word in sentence_words:  # Check if the word exists in the sentence
                sentence_list[i] = 1  # Set the corresponding index to 1
                
        vectorizer_list.append(sentence_list)
    
    # Convert the list of arrays to a numpy array
    vectorizer_list_new = np.asarray(vectorizer_list, dtype=np.float16)
    
    return vectorizer_list_new


In [84]:
vectorizer_x_train= vectorizer(combined_pd_x_train,vocab_train)
vectorizer_x_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float16)

In [85]:
vectorizer_x_test=vectorizer(combined_pd_x_test,vocab_train)
vectorizer_x_test

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float16)

In [86]:
## Check the vlue of counts of y train

In [87]:
y_train.value_counts()

target
0    3468
1    2622
Name: count, dtype: int64

In [88]:
vectorizer_x_train.shape

(6090, 13764)

In [89]:
y_train.shape

(6090,)

In [90]:
vectorizer_x_test.shape

(1523, 13764)

In [91]:
## fix the imbalance data issue

In [92]:
y_train.value_counts()

target
0    3468
1    2622
Name: count, dtype: int64

In [93]:
from imblearn.over_sampling import SMOTE

smote=SMOTE()
vectorized_x_train_smote , y_train_smote = smote.fit_resample(vectorizer_x_train,y_train)
print(vectorized_x_train_smote.shape, y_train_smote.shape)

(6936, 13764) (6936,)


##  Model Selectiona And Model Training

In [94]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

In [95]:
## logistci regression model

In [96]:
lr=LogisticRegression(random_state=42)
lr.fit(vectorized_x_train_smote,y_train_smote)

In [97]:
lr_predictions = lr.predict(vectorizer_x_test)
lr_predictions[1:10]

array([0, 1, 0, 0, 0, 0, 0, 0, 1], dtype=int64)

In [98]:
accuracy_score(y_test,lr_predictions)

0.7741300065659882

In [99]:
print(classification_report(y_test,lr_predictions))

              precision    recall  f1-score   support

           0       0.80      0.80      0.80       874
           1       0.73      0.74      0.74       649

    accuracy                           0.77      1523
   macro avg       0.77      0.77      0.77      1523
weighted avg       0.77      0.77      0.77      1523



In [100]:
## Cross Validation Score for logistic regression

In [101]:
from sklearn.model_selection import cross_val_score

# Perform cross-validation with 5 folds (can be adjusted based on dataset size)
cv_scores = cross_val_score(lr,vectorized_x_train_smote, y_train_smote, cv=10)

# Print cross-validation scores and their average
print("Cross-validation scores:", cv_scores)
print("Average cross-validation score:", cv_scores.mean())

Cross-validation scores: [0.81123919 0.78242075 0.82853026 0.81412104 0.8054755  0.79827089
 0.8023088  0.83982684 0.89321789 0.88311688]
Average cross-validation score: 0.8258528055358028


In [102]:
rf_clf =RandomForestClassifier(random_state=42)
rf_clf.fit(vectorized_x_train_smote,y_train_smote)

In [103]:
## Make the predictions based on test set

In [104]:
rf_clf_predictions= rf_clf.predict(vectorizer_x_test)
rf_clf_predictions[10:100]

array([1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       1, 0], dtype=int64)

In [105]:
accuracy_score(y_test,rf_clf_predictions)

0.7879185817465528

In [106]:
print(classification_report(y_test,rf_clf_predictions))

              precision    recall  f1-score   support

           0       0.79      0.86      0.82       874
           1       0.79      0.69      0.73       649

    accuracy                           0.79      1523
   macro avg       0.79      0.77      0.78      1523
weighted avg       0.79      0.79      0.79      1523



In [107]:
## Cross validation score for Random Forest Classifier

In [108]:
from sklearn.model_selection import cross_val_score

cv_scores =cross_val_score(rf_clf,vectorized_x_train_smote,y_train_smote, cv=10)
# Print cross-validation scores and their average
print("Cross-validation scores:", cv_scores)
print("Average cross-validation score:", cv_scores.mean())

Cross-validation scores: [0.7795389  0.76368876 0.80115274 0.78386167 0.76657061 0.77089337
 0.78787879 0.85137085 0.92929293 0.94372294]
Average cross-validation score: 0.8177971564138711


## Test dataset

In [113]:
test_dataset= pd.read_csv("test.csv")
test_dataset.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [114]:
## check the null values of the dataset

In [115]:
test_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB


In [116]:
## check the value counts of the keyword

In [117]:
test_dataset['keyword'].value_counts()

keyword
deluged               23
demolished            22
rubble                22
first%20responders    21
seismic               21
                      ..
threat                 5
fatalities             5
forest%20fire          5
inundation             4
epicentre              1
Name: count, Length: 221, dtype: int64

In [118]:
## check the value counts of the location

In [119]:
test_dataset['location'].value_counts()

location
New York                  38
USA                       37
Worldwide                 16
United States             15
London                    13
                          ..
Medford, NJ                1
Quezon City                1
LanÌ¼s                     1
USA,Washington,Seattle     1
Brussels, Belgium          1
Name: count, Length: 1602, dtype: int64

In [120]:
## Fill the null values of the dataset

In [121]:
 test_dataset['location'].fillna('Quezon City', inplace=True)
 test_dataset['keyword'].fillna('epicentre', inplace=True)

In [122]:
test_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3263 non-null   object
 2   location  3263 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB


In [123]:
## Text preprocessing

In [124]:
import string
import re

In [125]:
def remove_punctuations_of_keyword(text):
    for punctuations in string.punctuation:
        text=text.replace(punctuations,'')
    return text


In [126]:
def text_preprocess(full_text):
    ## convert text into lowercase
    full_text=full_text.apply(lambda x:" ".join(x.lower() for x in x.split()))
    ##  remove links
    full_text=full_text.apply(lambda x: " ".join(re.sub(r'^https?:\/\/.*[\r\n]*','',x,flags=re.MULTILINE) for x in x.split()))
    ## remove the puncutations
    full_text=full_text.apply(remove_punctuations_of_keyword)
    ## Remove the numbers
    full_text=full_text.str.replace('\d+','',regex=True)
    ## Remove the stop words
    full_text=full_text.apply(lambda x:" ".join(x for x in x.split() if x not in sw))
    ## stemming technique
    full_text=full_text.apply(lambda x:" ".join(ps.stem(x) for x in x.split()))

    return full_text
test_dataset_text = text_preprocess(test_dataset['text'])
test_dataset_keyword = text_preprocess(test_dataset['keyword'])
test_dataset_location = text_preprocess(test_dataset['location'])

  full_text=full_text.str.replace('\d+','',regex=True)


In [127]:
test_dataset_location.head()

0    quezon citi
1    quezon citi
2    quezon citi
3    quezon citi
4    quezon citi
Name: location, dtype: object

In [128]:
test_dataset_keyword.head()

0    epicentr
1    epicentr
2    epicentr
3    epicentr
4    epicentr
Name: keyword, dtype: object

In [129]:
test_dataset_text.head()

0                             happen terribl car crash
1        heard earthquak differ citi stay safe everyon
2    forest fire spot pond gees flee across street ...
3                       apocalyps light spokan wildfir
4                   typhoon soudelor kill china taiwan
Name: text, dtype: object

In [130]:
## create the vocabulary 

In [131]:
#from collections import Counter

#vocab_test = Counter()

In [132]:
#for words in test_dataset['text']:
 #   vocab_test.update(words.split())

In [133]:
#len(vocab_test)

In [134]:
#for words in test_dataset['keyword']:
 #   vocab_test.update(words.split())

In [135]:
#len(vocab_test)

In [136]:
#for words in test_dataset['location']:
 #   vocab_test.update(words.split())

In [137]:
#len(vocab_test)

In [138]:
## Combined the columns of test dataset

In [139]:
combined_test_pd= test_dataset_keyword+" "+test_dataset_location+" "+test_dataset_text
combined_test_pd

0           epicentr quezon citi happen terribl car crash
1       epicentr quezon citi heard earthquak differ ci...
2       epicentr quezon citi forest fire spot pond gee...
3       epicentr quezon citi apocalyps light spokan wi...
4       epicentr quezon citi typhoon soudelor kill chi...
                              ...                        
3258    epicentr quezon citi earthquak safeti lo angel...
3259    epicentr quezon citi storm ri wors last hurric...
3260       epicentr quezon citi green line derail chicago
3261    epicentr quezon citi meg issu hazard weather o...
3262    epicentr quezon citi cityofcalgari activ munic...
Length: 3263, dtype: object

In [140]:
import numpy as np

def vectorizer_test(ds, vocabulary):
    vectorizer_list = []
    vocab_set=set(vocabulary)
    for sentence in ds:
        sentence_list = np.zeros(len(vocabulary), dtype=np.float16)  # Create a zero array for each sentence
        sentence_words = set(sentence.split())  # Convert the sentence into a set of words
        
        for i, word in enumerate(vocabulary):  # Iterate over the vocabulary (list, ordered)
            if word in sentence_words:  # Check if the word exists in the sentence
                sentence_list[i] = 1  # Set the corresponding index to 1
                
        vectorizer_list.append(sentence_list)
    
    # Convert the list of arrays to a numpy array
    vectorizer_list_new = np.asarray(vectorizer_list, dtype=np.float16)
    
    return vectorizer_list_new


In [141]:
vectorizer_test = vectorizer_test(combined_test_pd, vocab_train)

In [142]:
vectorizer_test

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float16)

In [143]:
## store the ids for later use

In [144]:
id =test_dataset['id']
id

0           0
1           2
2           3
3           9
4          11
        ...  
3258    10861
3259    10865
3260    10868
3261    10874
3262    10875
Name: id, Length: 3263, dtype: int64

In [145]:
## Make the predictions

In [150]:
rf_clf_predictions = rf_clf.predict(vectorizer_test)
rf_clf_predictions[100:110]



array([0, 0, 1, 0, 0, 0, 0, 0, 1, 0], dtype=int64)

In [151]:

# Create submission DataFrame
submission_df = pd.DataFrame({
    "id": id,
    "target":rf_clf_predictions
})

submission_df.head(20)

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
5,12,1
6,21,0
7,22,0
8,27,0
9,29,0


In [152]:
# # Save submission file
submission_df.to_csv("model_results.csv", index=False)

print("Submission file saved successfully.")

Submission file saved successfully.
