In [6]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
%matplotlib inline
from collections import  Counter, defaultdict
import json
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
stop=set(stopwords.words('english'))
import re
from wordcloud import WordCloud
import emoji
from spellchecker import SpellChecker
from textblob import TextBlob
from mlxtend.plotting import plot_confusion_matrix
from nltk.stem import PorterStemmer 
from sklearn.manifold import TSNE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
import string
from package.Clean_data import PreProcessTweets

In [7]:
#reading the data
tweet_train= pd.read_csv('train.csv')
tweet_test=pd.read_csv('test.csv')
tweet_train.head(3)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1


In [8]:
# Number of Links in a tweet
tweet_train['numoflinks']=tweet_train['text'].apply(lambda x: len(re.findall(r"http(\w+)", x)))
tweet_test['numoflinks']=tweet_test['text'].apply(lambda x: len(re.findall(r"http(\w+)", x)))

In [9]:
# HashTag Analysis
def find_numhashtags(tweet):
    hashtag_list=re.findall(r"#(\w+)", tweet)
    return len(hashtag_list)

def find_hashtags(tweet):
    #multiple hashtags case - ?
    #hashtags in the website case
    hashtag_list=re.findall(r"#(\w+)", tweet)
    return ', '.join([x for x in hashtag_list])

def return_hashtag_keyword(tweet):
    hashtag_list=re.findall(r"#(\w+)", tweet)
    return hashtag_list[0]

tweet_train['hashtags']=tweet_train['text'].apply(lambda x: find_hashtags(x))
tweet_train['numofhashtags']=tweet_train['text'].apply(lambda x: find_numhashtags(x))

tweet_test['hashtags']=tweet_test['text'].apply(lambda x: find_hashtags(x))
tweet_test['numofhashtags']=tweet_test['text'].apply(lambda x: find_numhashtags(x))

In [10]:
pr=PreProcessTweets()
tweet_train['text']=tweet_train['text'].apply(lambda x: pr.clean_tweet(x))
tweet_test['text']=tweet_test['text'].apply(lambda x: pr.clean_tweet(x))

In [11]:
tweet_train['keyword'].fillna(tweet_train['hashtags'].apply(lambda x: np.NaN if x=='' else x.split(',')[-1].strip()), inplace=True)
tweet_test['keyword'].fillna(tweet_test['hashtags'].apply(lambda x: np.NaN if x=='' else x.split(',')[-1].strip()), inplace=True)
tweet_train['keyword'].fillna('the',inplace=True)
tweet_test['keyword'].fillna('the',inplace=True)

In [12]:
tweet_train['stem_key']=tweet_train['keyword'].apply(lambda x: pr.clean_keywords(x))
tweet_test['stem_key']=tweet_test['keyword'].apply(lambda x: pr.clean_keywords(x))

In [13]:
"""
feature 1 - word length
feature 2 - number of links
feature 3 - keyword weight
"""

# feature 1 - word length
tweet_train['avg_word_len']=tweet_train['text'].apply(lambda x: np.average([len(i) for i in x.split()]))
tweet_test['avg_word_len']=tweet_test['text'].apply(lambda x: np.average([len(i) for i in x.split()]))
tweet_train['avg_word_len'].fillna(0,inplace=True)
tweet_test['avg_word_len'].fillna(0,inplace=True)

# feature 2 - number of links - already done

# feature 3 - keyword weight - should be normalized #
def keyword_weight(kw,df):
    freq1class=df[(df['stem_key']==kw)&(df['target']==1)].shape[0]
    freq0class=df[(df['stem_key']==kw)&(df['target']==0)].shape[0]
    if freq0class==0:
        return freq1class
    else:
        return freq1class/freq0class
tweet_train['keyword_weight']=tweet_train['stem_key'].apply(lambda x: keyword_weight(x,tweet_train))
kw_dict=pd.Series(tweet_train.keyword_weight.values,index=tweet_train.stem_key).to_dict()
tweet_test['keyword_weight']=tweet_test['stem_key'].apply(lambda x: kw_dict[x] if x in list(kw_dict.keys()) else 0)

  avg = a.mean(axis)


In [14]:
#logreg
columns = ['avg_word_len', 'numoflinks','keyword_weight']
train_df=tweet_train[columns]
test_df=tweet_test[columns]
logreg=LogisticRegression(verbose=1,random_state=0, C=5, penalty='l2')
model=logreg.fit(train_df,tweet_train['target'])
print("train accuracy")
train_pred=model.predict(train_df)
print(accuracy_score(tweet_train['target'], train_pred))

[LibLinear]train accuracy
0.7171942729541574




In [15]:
# save the model and dictionary
import pickle
pickle.dump(model, open("base_model5.pkl", 'wb'))
pickle.dump(kw_dict,open("keyword_dict.pkl",'wb'))

In [16]:
tweet_train['pred']=train_pred

In [17]:
tweet_train.head()

Unnamed: 0,id,keyword,location,text,target,numoflinks,hashtags,numofhashtags,stem_key,avg_word_len,keyword_weight,pred
0,1,earthquake,,deeds reason earthquake may allah forgive,1,0,earthquake,1,earthquak,6.0,3.444444,1
1,4,the,,forest fire near ronge sask canada,1,0,,0,the,4.833333,1.277778,0
2,5,the,,residents asked shelter place notified officer...,1,0,,0,the,7.090909,1.277778,0
3,6,wildfires,,people receive wildfires evacuation orders cal...,1,0,wildfires,1,wildfir,8.0,8.0,1
4,7,wildfires,,got sent photo ruby alaska smoke wildfires pou...,1,0,"Alaska, wildfires",2,wildfir,5.222222,8.0,1
