In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import re
import string

DT = pd.read_csv(r'C:\Users\MY\Downloads\Datasets\DisasterTweets\train.csv')
DT.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [2]:
DT.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [3]:
DT.isnull().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [4]:
DT['target'].value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [5]:
Counter(DT['location']).most_common()[1]

('USA', 104)

In [6]:
DT['location'].mode()[0]

'USA'

In [7]:
MaxLoc = DT['location'].mode()[0]

In [8]:
DT['location'].fillna(MaxLoc, inplace = True)

In [9]:
Counter(DT['keyword']).most_common()[1]

('fatalities', 45)

In [10]:
def URL(tweet):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'', tweet)

def HTML(tweet):
    html = re.compile(r'<.*?>')
    return html.sub(r'', tweet)

def emoji(tweet):
    emojis = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emojis.sub(r'', tweet)

def punct(tweet):
    puncts = tweet.maketrans('', '', string.punctuation)
    return tweet.translate(puncts)

def case(tweet):
    tweet = tweet.lower()
    return tweet

In [11]:
DT['text'] = DT['text'].apply(lambda x : URL(x))
DT['text'] = DT['text'].apply(lambda x : HTML(x))
DT['text'] = DT['text'].apply(lambda x : emoji(x))
DT['text'] = DT['text'].apply(lambda x : punct(x))
DT['text'] = DT['text'].apply(lambda x : case(x))

In [16]:
from rake_nltk import Rake

extractor = Rake()

def fillk(w):
    if(pd.isnull(w['keyword'])):
        extractor.extract_keywords_from_text(w['text'])
        key_extracted = extractor.get_ranked_phrases()[0]
        print(key_extracted)
        w['keyword'] = key_extracted 
        
    else:
        w = w
        
    return w

DT = DT.apply(fillk, axis = 1)            

In [13]:
DT.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7613 non-null   object
 2   location  7613 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [14]:
DT.isnull().sum()

id          0
keyword     0
location    0
text        0
target      0
dtype: int64

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
stopwords = stopwords.words('english')

countvec = CountVectorizer(strip_accents = 'ascii', stop_words = stopwords, ngram_range = (1, 2))
tfidf = TfidfVectorizer(strip_accents = 'ascii', stop_words = stopwords, ngram_range = (1, 2))

In [14]:
countfeatures = countvec.fit_transform(DT['text'])
countfeatures = pd.DataFrame(countfeatures.toarray())
countfeatures.sample(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
5599,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
794,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5991,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5679,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
872,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1295,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3970,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
777,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
237,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [15]:
tfidffeatures = tfidf.fit_transform(DT['text'])
tfidffeatures = pd.DataFrame(tfidffeatures.toarray())
tfidffeatures.sample(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
3171,0.452378,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5926,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5521,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7189,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
119,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2487,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7228,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.587141,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7290,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4492,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7401,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
y = DT['target']

In [17]:
from sklearn.model_selection import train_test_split, GridSearchCV

X_train1, X_test1, y_train1, y_test1 = train_test_split(countfeatures, y, test_size = 0.2, random_state = 1)
X_train2, X_test2, y_train2, y_test2 = train_test_split(tfidffeatures, y, test_size = 0.2, random_state = 1)

In [18]:
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

#modeltest = RandomForestClassifier()
#modelc = svm.SVC(kernel = 'linear', C = 1.0).fit(X_train1, y_train1)
#predictions = modelc.predict(X_test1)
#ac = accuracy_score(predictions, y_test1)
#print(ac)

In [19]:
#from sklearn import svm
#from sklearn.metrics import accuracy_score
#modelt = svm.SVC(kernel = 'linear', C = 1.0).fit(X_train2, y_train2)
#predictions = modelt.predict(X_test2)
#ac = accuracy_score(predictions, y_test2)
#print(ac)

In [20]:
test = pd.read_csv(r'C:\Users\MY\Downloads\Datasets\DisasterTweets\test.csv')
test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [21]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB


In [22]:
test.isnull().sum()

id             0
keyword       26
location    1105
text           0
dtype: int64

In [23]:
Counter(test['location']).most_common(2)

[(nan, 1105), ('New York', 38)]

In [24]:
test['location'].mode()[0]

'New York'

In [25]:
MaxLoct = test['location'].mode()[0]

In [26]:
test['location'].fillna(MaxLoct, inplace = True) 

In [27]:
def fillkt(t):
    if(pd.isnull(t['keyword'])):
        extractor.extract_keywords_from_text(t['text'])
        key_extracted1 = extractor.get_ranked_phrases()[:1]
        print(key_extracted1)
        t['keyword'] = key_extracted1
        
    else:
        t = t
        
    return t

test = test.apply(fillkt, axis = 1)

['terrible car crash']
['stay safe everyone']
['spot pond']
['apocalypse lighting']
['typhoon soudelor kills 28']
['shaking ...']
['probably still show']
['hey']
['nice hat']
['fuck']
['like cold']
['nooooooooo']
['tell']
[]
['awesome']
['possible exposure officials say http ://']
['missing flight mh370 http ://']
['possible ebola case']
['harun ìàekdar ... http ://']
['please look']
['suicide car bombing']
['earthquake safety los angeles \x89 ûò safety fasteners xrwn']
['around 20000k still without power']
['green line derailment']
['meg issues hazardous weather outlook']
['municipal emergency plan']


In [28]:
test.isnull().sum()

id          0
keyword     0
location    0
text        0
dtype: int64

In [29]:
testfeatures = countvec.fit_transform(test['text'])
testfeatures = pd.DataFrame(testfeatures.toarray())
testfeatures.sample(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
2369,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
386,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1014,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2028,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
400,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2290,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
363,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
116,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
665,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1748,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
modelc = svm.SVC(kernel = 'linear', C = 1.0, gamma = 'scale')
modelc.fit(countfeatures, y)
predictions = modelc.predict(testfeatures)
print(predictions)

[1 0 0 ... 0 0 0]


In [31]:
ID = pd.Series(test['id'])
targets = pd.Series(predictions)

data = pd.concat([ID, targets], axis = 1)

In [32]:
df = pd.DataFrame(data).reset_index()
df.columns = ['Index', 'id', 'target']
df = df.drop(['Index'], axis = 1)
df.head()

Unnamed: 0,id,target
0,0,1
1,2,0
2,3,0
3,9,0
4,11,0


In [33]:
df['target'].value_counts()

0    2483
1     780
Name: target, dtype: int64

In [34]:
#df.to_csv(r'C:\Users\MY\Downloads\Datasets\DisasterTweets\SVlinear9.csv', index = False)