In [2]:
import nltk
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PassiveAggressiveClassifier
import re
from sklearn.naive_bayes import GaussianNB,BernoulliNB
from sklearn.cluster import KMeans
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier,VotingClassifier
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.tree import DecisionTreeClassifier

In [3]:
data = pd.read_csv("F:/Git_projects/natural language processing/train.csv")
test_data = pd.read_csv("F:/Git_projects/natural language processing/test.csv")

In [4]:
data.groupby("target").nunique()

Unnamed: 0_level_0,id,keyword,location,text
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,4342,218,2142,4315
1,3271,220,1513,3206


In [5]:
data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [6]:
test_data.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [7]:
y = data["target"].to_numpy()

In [8]:
data["keyword"] = data["keyword"].replace(np.nan,".")
data["location"] = data["location"].replace(np.nan,".")
test_data["keyword"] = test_data["keyword"].replace(np.nan,".")
test_data["location"] = test_data["location"].replace(np.nan,".")

In [23]:
stop_words = set(stopwords.words('english'))
stop_words.update(['the'],['http'],['https'])

In [24]:
def preprocessing(dataframe):
    tweets = []
    dataframe["text"] = dataframe["text"].str.cat(dataframe["keyword"],sep = " ")
    dataframe["text"] = dataframe["text"].str.cat(dataframe["location"],sep = " ")
    for tweet in range(0,len(dataframe)):
        temp = dataframe["text"][tweet].lower()
        temp = re.sub(r'\W',' ',temp)
        temp = re.sub(r'\s+[a-zA-Z]\s+',' ',temp)
        temp = re.sub(r'\^[a-zA-Z]\s+',' ',temp)
        temp = re.sub(r'\s+',' ',temp)
        tweets.append(temp)
    return tweets

In [25]:
def vectorize(tweets):
    vect = TfidfVectorizer(max_features=1550, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))  
    X = vect.fit_transform(tweets).toarray()
    return X

In [26]:
pro_tweets = preprocessing(data)
pro_test_tweets = preprocessing(test_data)
vect_tweets = vectorize(pro_tweets)
vect_test_tweets = vectorize(pro_test_tweets)

In [27]:
X_train,X_test,y_train,y_test = train_test_split(vect_tweets,y,test_size = 0.2,stratify = y,random_state = 666)

In [28]:
classifier = PassiveAggressiveClassifier(C= 0.5,random_state = 666)
classifier.fit(X_train,y_train)
predictions = classifier.predict(X_test)

In [29]:
print("Estimator : PAssiveAggressiveClassififer")
print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))

Estimator : PAssiveAggressiveClassififer
[[638 231]
 [153 501]]
              precision    recall  f1-score   support

           0       0.81      0.73      0.77       869
           1       0.68      0.77      0.72       654

    accuracy                           0.75      1523
   macro avg       0.75      0.75      0.75      1523
weighted avg       0.75      0.75      0.75      1523



In [14]:
c1= GaussianNB() 
c2= BernoulliNB()
c3= DecisionTreeClassifier(max_depth = 5,random_state = 666)
c4= SVC(kernel = "rbf",random_state = 666)
c5= RandomForestClassifier(random_state = 666)
VC = VotingClassifier(estimators = [("GNB",c1),("BNB",c2),("DTC",c3),("SVM",c4),("RFC",c5)],voting = "hard")

In [83]:
VC.fit(X_train,y_train)
VC_predictions = VC.predict(X_test)
print("Estimator: Voting System")
print(confusion_matrix(y_test,VC_predictions))
print(classification_report(y_test,VC_predictions))

Estimator: Voting System
[[750 119]
 [214 440]]
              precision    recall  f1-score   support

           0       0.78      0.86      0.82       869
           1       0.79      0.67      0.73       654

    accuracy                           0.78      1523
   macro avg       0.78      0.77      0.77      1523
weighted avg       0.78      0.78      0.78      1523



In [15]:
VC_test_predictions = VC.predict(vect_test_tweets)

In [24]:
test_data["target"] = VC_test_predictions

In [26]:
submission =test_data[["id","target"]]

In [34]:
submission.to_excel("submission.xls",index = False)