# Supervised Modeling

In [1]:
import pandas as pd
import nltk
import numpy as np
import regex as re
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

  import pandas.util.testing as tm


In [2]:
df = pd.read_csv('./Datasets/big_df.csv')

### Road Closure Classification

In [3]:
df.head(5)

Unnamed: 0,screen_name,username,user_id,tweet_id,tweet_url,timestamp,timestamp_epochs,text,text_html,links,...,img_urls,video_url,likes,retweets,replies,is_replied,is_reply_to,parent_tweet_id,reply_to_users,query
0,EPCFIRM,EPCF #GreenNewDeal #SunriseMovement,836834627771850752,1188564512669962240,/EPCFIRM/status/1188564512669962240,2019-10-27 21:13:56,1572210836,Our hearts go out to Californians affected by ...,"<p class=""TweetTextSize js-tweet-text tweet-te...",[],...,['https://pbs.twimg.com/media/EH6hH4FVAAA6ZA_....,,8,1,1,True,False,,[],saddleridgefire
1,jsingpubhealth,Jason Singson,3320904614,1188545671076298752,/jsingpubhealth/status/1188545671076298752,2019-10-27 19:59:04,1572206344,"From the #SaddleridgeFire to the #KincadeFire,...","<p class=""TweetTextSize js-tweet-text tweet-te...",['https://twitter.com/CAgovernor/status/118852...,...,[],,1,0,0,False,False,,[],saddleridgefire
2,BrandyCGrote,Brandy Grote,898902973,1188517925503635456,/BrandyCGrote/status/1188517925503635456,2019-10-27 18:08:49,1572199729,Our #SaddleridgeFire leaped a 12 lane major fr...,"<p class=""TweetTextSize js-tweet-text tweet-te...",[],...,[],,1,1,0,False,True,1.188496e+18,"[{'screen_name': 'BillMendocino', 'user_id': '...",saddleridgefire
3,mlzema,Mark In SoCal,18340526,1188515108562931714,/mlzema/status/1188515108562931714,2019-10-27 17:57:38,1572199058,"Good Morning, Sam!\nAll is well here. However,...","<p class=""TweetTextSize js-tweet-text tweet-te...",['https://twitter.com/Sam17970863/status/11885...,...,[],,1,0,0,False,False,,[],saddleridgefire
4,LVTIA,LVTIA,87142711,1188502337867829248,/LVTIA/status/1188502337867829248,2019-10-27 17:06:53,1572196013,Was your property or home damaged by the #Sadd...,"<p class=""TweetTextSize js-tweet-text tweet-te...",['https://www.211la.org/fire-saddleridge'],...,[],,0,0,0,False,False,,[],saddleridgefire


In [4]:
df.columns

Index(['screen_name', 'username', 'user_id', 'tweet_id', 'tweet_url',
       'timestamp', 'timestamp_epochs', 'text', 'text_html', 'links',
       'hashtags', 'has_media', 'img_urls', 'video_url', 'likes', 'retweets',
       'replies', 'is_replied', 'is_reply_to', 'parent_tweet_id',
       'reply_to_users', 'query'],
      dtype='object')

In [5]:
## Drop unnecessary columns 
df_text_query = df.drop(columns= ['screen_name', 'user_id', 'tweet_id', 'tweet_url', 'timestamp', 'timestamp_epochs', 'text_html', 'links', 'hashtags', 'has_media', 'img_urls', 'video_url', 'likes', 'retweets', 'replies', 'is_replied', 'is_reply_to', 'parent_tweet_id', 'reply_to_users'])

### Check for Null Values

In [6]:
df_text_query.isnull().sum()

username      0
text        105
query         0
dtype: int64

In [7]:
df_text_query.dropna(axis=0, how='any', inplace=True)

In [8]:
df_text_query.head()

Unnamed: 0,username,text,query
0,EPCF #GreenNewDeal #SunriseMovement,Our hearts go out to Californians affected by ...,saddleridgefire
1,Jason Singson,"From the #SaddleridgeFire to the #KincadeFire,...",saddleridgefire
2,Brandy Grote,Our #SaddleridgeFire leaped a 12 lane major fr...,saddleridgefire
3,Mark In SoCal,"Good Morning, Sam!\nAll is well here. However,...",saddleridgefire
4,LVTIA,Was your property or home damaged by the #Sadd...,saddleridgefire


In [9]:
df_text_query.isnull().sum()

username    0
text        0
query       0
dtype: int64

In [10]:
df_text_query.shape

(9075, 3)

### Cleaning Tweets using RegEx

In [11]:
## Lower case texts
df_text_query['text'] = df_text_query['text'].str.lower()

In [12]:
## Removing line breaks
df_text_query['text'] = df_text_query['text'].map(lambda x: re.sub('\/\/', ' ', x)) 

In [13]:
## Removing Apostrophes
df_text_query['text'] = df_text_query['text'].map(lambda x: re.sub('[\\][\']', '', x))

In [14]:
## Removing URLs
df_text_query['text'] = df_text_query['text'].map(lambda x: re.sub('http[s]?:\/\/[^\s]*', ' ', x))

In [15]:
df_text_query.head(5)

Unnamed: 0,username,text,query
0,EPCF #GreenNewDeal #SunriseMovement,our hearts go out to californians affected by ...,saddleridgefire
1,Jason Singson,"from the #saddleridgefire to the #kincadefire,...",saddleridgefire
2,Brandy Grote,our #saddleridgefire leaped a 12 lane major fr...,saddleridgefire
3,Mark In SoCal,"good morning, sam!\nall is well here. however,...",saddleridgefire
4,LVTIA,was your property or home damaged by the #sadd...,saddleridgefire


### Road Closure Classification

#### We want to determine if a road was open or closed. We achieved this by passing our dataframe through the filters we created

In [16]:
# establish list of words to search for in dataframe
road_keywords = ['road', 'st','rd', 'hwy', 'highway', 'ave', 'avenue', 'street', 'boulevard'
                 'intersection', 'bridge', 'sr-', 'cr-', 'us-', 'i-', 'blvd']

# list of words associated with road closures
closed_keywords = ['closed', 'remains closed', 'shut down', 'backed up',
                   'no travel', 'delay', 'blocked', 'delays',
                   'disabled', 'traffic', 'fire', 'closures', 'closure']

to_drop = ["open", "opened", "lifted", "reopened", "clear", "cleared"]

In [17]:
df_text_query['is_road_closed']= df_text_query['text'].map(lambda x: 1 if ((any(word in x for word in road_keywords))
                                                           & (any(word in x for word in closed_keywords))
                                                           & (not any(word in x for word in to_drop))
                                                          ) else 0)

In [18]:
df_text_query.head()

Unnamed: 0,username,text,query,is_road_closed
0,EPCF #GreenNewDeal #SunriseMovement,our hearts go out to californians affected by ...,saddleridgefire,1
1,Jason Singson,"from the #saddleridgefire to the #kincadefire,...",saddleridgefire,1
2,Brandy Grote,our #saddleridgefire leaped a 12 lane major fr...,saddleridgefire,0
3,Mark In SoCal,"good morning, sam!\nall is well here. however,...",saddleridgefire,1
4,LVTIA,was your property or home damaged by the #sadd...,saddleridgefire,0


In [19]:
df_text_query['is_road_closed'].value_counts()

0    5867
1    3208
Name: is_road_closed, dtype: int64

In [20]:
## assign road closed or open dataframes
closed_road = df_text_query[df_text_query['is_road_closed'] == 1]
open_road = df_text_query[df_text_query['is_road_closed'] == 0]

In [21]:
closed_road.shape

(3208, 4)

In [22]:
open_road.shape

(5867, 4)

In [23]:
# find which handles had information about road closures
closed_road['username'].value_counts()

Go511                          557
Caltrans District 7            137
Brian Douglas                  132
Desmond Shaw                   117
Spectrum News 1 SoCal           87
                              ... 
NLACRC                           1
A g n ė ♡                        1
Colleen McDonnell                1
Lorna Paul                       1
CA AmeriCorps Disaster Team      1
Name: username, Length: 910, dtype: int64

In [25]:
closed_road.shape

(3208, 4)

## Declaring a Tokenizer Function:

In [33]:
tokenizer = RegexpTokenizer(r'\s+', gaps=True)

def tokenizing_function(df_col):

    string = ''
    
    try:
    
        for post in df_col:
            string += ' ' + post
            
    except:
        pass
    
    all_tokens = tokenizer.tokenize(string)
    
    return all_tokens

### Calling Tokenizer Function on `text` Column and Saving as Variable

In [35]:
text_tokens = tokenizing_function(df['hashtags'])
text_tokens

["['wildfires',",
 "'ClimateChange',",
 "'ClimateCrisis',",
 "'MinesFire',",
 "'TickFire',",
 "'KincadeFire',",
 "'SaddleRidgeFire',",
 "'RawsonFire',",
 "'SkyFire',",
 "'SawdayFire',",
 "'MillerFire',",
 "'PalisadesFirepic']",
 "['SaddleridgeFire',",
 "'KincadeFire']",
 "['SaddleridgeFire']",
 "['SaddleridgeFire']",
 "['SaddleRidgeFire']",
 "['SaddleridgeFire']",
 "['KincadeFire',",
 "'KincadeFire',",
 "'KincadeFire',",
 "'KincadeFire',",
 "'KincadeFire']",
 "['sabadoencasa',",
 "'SABADODELUXE',",
 "'saturdaynight',",
 "'Sabadosad',",
 "'SaddleridgeFire',",
 "'sadboy']",
 "['porterranch',",
 "'SaddleRidgeFire']",
 "['wildfires',",
 "'ClimateChange',",
 "'ClimateCrisis',",
 "'MinesFire',",
 "'TickFire',",
 "'KincadeFire',",
 "'SaddleRidgeFire',",
 "'RawsonFire',",
 "'SkyFire',",
 "'SawdayFire',",
 "'MillerFire',",
 "'PalisadesFirepic']",
 "['SaddleridgeFire',",
 "'KincadeFire']",
 "['SaddleridgeFire']",
 "['SaddleridgeFire']",
 "['SaddleRidgeFire']",
 "['SaddleridgeFire']",
 "['Kincade

## Declaring Count Vectorizer Function:

In [30]:
cvec = CountVectorizer(stop_words= 'english', min_df=5)

def cvec_function(df_col):
    cvec_matrix = cvec.fit_transform(df_col)

    #Convert to DataFrame
    cvec_df = pd.DataFrame(cvec_matrix.toarray(),
                          columns=cvec.get_feature_names())
    return cvec_df

### Calling the cvec_function with `text` column and saving the matrix as `text_cvec_df` variable

In [32]:
text_cvec_df = cvec_function(df_text_query['text'])

### Viewing Top 50 Words in `text` column:

In [36]:
text_cvec_df.sum().sort_values(ascending=False).head(50)

com                4822
twitter            3648
https              1988
pic                1714
saddleridgefire    1402
knx1070            1289
status             1041
10                  958
la                  835
tickfire            788
traffic             759
news                651
ca                  648
2019                641
http                626
lacofdpio           571
thank               559
west                540
county              535
closed              514
lanes               514
specnews1socal      511
update              494
rd                  492
www                 483
california          475
just                475
ly                  464
open                454
lafd                446
los                 443
bit                 442
caltransdist7       425
angeles             422
blvd                408
acres               393
south               384
canyon              383
knxtraffic          377
sigalert            377
spectrumnews1       376
sr              