In [42]:
    #General
import pandas as pd
import numpy as np


    #Plotting
import matplotlib.pyplot as plt
import seaborn as sns

    #nltk and regex packages
import regex as re
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

    #Word Vectors
import gensim


    #Sklearn Packages
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction import stop_words, text

%config InlineBackend.figure_format = 'retina'

In [75]:
df = pd.read_csv("./Datasets/big_df.csv")

In [76]:
df.head(2)

Unnamed: 0,screen_name,username,user_id,tweet_id,tweet_url,timestamp,timestamp_epochs,text,text_html,links,...,img_urls,video_url,likes,retweets,replies,is_replied,is_reply_to,parent_tweet_id,reply_to_users,query
0,EPCFIRM,EPCF #GreenNewDeal #SunriseMovement,836834627771850752,1188564512669962240,/EPCFIRM/status/1188564512669962240,2019-10-27 21:13:56,1572210836,Our hearts go out to Californians affected by ...,"<p class=""TweetTextSize js-tweet-text tweet-te...",[],...,['https://pbs.twimg.com/media/EH6hH4FVAAA6ZA_....,,8,1,1,True,False,,[],saddleridgefire
1,jsingpubhealth,Jason Singson,3320904614,1188545671076298752,/jsingpubhealth/status/1188545671076298752,2019-10-27 19:59:04,1572206344,"From the #SaddleridgeFire to the #KincadeFire,...","<p class=""TweetTextSize js-tweet-text tweet-te...",['https://twitter.com/CAgovernor/status/118852...,...,[],,1,0,0,False,False,,[],saddleridgefire


In [77]:
df.columns

Index(['screen_name', 'username', 'user_id', 'tweet_id', 'tweet_url',
       'timestamp', 'timestamp_epochs', 'text', 'text_html', 'links',
       'hashtags', 'has_media', 'img_urls', 'video_url', 'likes', 'retweets',
       'replies', 'is_replied', 'is_reply_to', 'parent_tweet_id',
       'reply_to_users', 'query'],
      dtype='object')

#### Creating a DateFrame with Just Text and Query Columns

In [78]:
text_df = df[['text', 'query', 'hashtags']]

In [79]:
text_df.head()

Unnamed: 0,text,query,hashtags
0,Our hearts go out to Californians affected by ...,saddleridgefire,"['wildfires', 'ClimateChange', 'ClimateCrisis'..."
1,"From the #SaddleridgeFire to the #KincadeFire,...",saddleridgefire,"['SaddleridgeFire', 'KincadeFire']"
2,Our #SaddleridgeFire leaped a 12 lane major fr...,saddleridgefire,['SaddleridgeFire']
3,"Good Morning, Sam!\nAll is well here. However,...",saddleridgefire,['SaddleridgeFire']
4,Was your property or home damaged by the #Sadd...,saddleridgefire,['SaddleRidgeFire']


In [80]:
text_df.shape

(9180, 3)

In [81]:
text_df.isnull().sum()

text        105
query         0
hashtags      0
dtype: int64

#### Dropping Rows With Nulls

In [91]:
text_df.dropna(inplace=True)
text_df.reset_index(drop = True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [92]:
text_df.isnull().sum()

text        0
query       0
hashtags    0
dtype: int64

In [93]:
text_df.iloc[1, :]['text']

'From the #SaddleridgeFire to the #KincadeFire, as well as many emerging small/large fires today, we have quickly seen how California can be brought to a halt. Public health/EP/EMS departments are going to be in overdrive. We must be realistic about our future.https://twitter.com/CAgovernor/status/1188523109051027456\xa0…'

#### Making the `text` lower case

In [99]:
text_df['text'] = [x.lower() for x in text_df['text']]
text_df['text']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


0       our hearts go out to californians affected by ...
1       from the #saddleridgefire to the #kincadefire,...
2       our #saddleridgefire leaped a 12 lane major fr...
3       good morning, sam!\nall is well here. however,...
4       was your property or home damaged by the #sadd...
                              ...                        
9070    former state lawmaker says he's got a plan to ...
9071               totally makes sense to do this midday 
9072    a sanford, maine mom says her 9-year-old birac...
9073       i noticed it in anaheim hills, it looks crazy!
9074    the copyright owners of the "charlie brown chr...
Name: text, Length: 9075, dtype: object

#### Pulling Out Hashtags From Text

In [11]:
#Adapted from: https://gist.github.com/mahmoud/237eb20108b5805aed5f

# hashtag_re = re.compile("(?:^|\s)[＃#]{1}(\w+)", re.UNICODE)

#### Pulling out the hashtags for specified .iloc

In [12]:
# hashtag_re.findall(text_df.iloc[1, :]['text'])

In [13]:
# text_df['hashtags'] = None

# for n in range(len(text_df['text'])):
    
#     try: 
#         text_df['hashtags'] = hashtag_re.findall(text_df.iloc[n,:]['text'])
        
#     except:
#         pass

#### Removing pic.twitter.com and https://twitter.com stings 

In [14]:
# this pattern deletes pic.twitter.com and https://twitter.com strings


# z = text_df['text'][0]
del_pattern = '(pic\.twitter\.com.*)|(https:[\/][\/]twitter\.com.*)'
text_df['text'] = text_df['text'].apply(lambda x: re.sub(del_pattern, '', x))

# re.sub(del_pattern, '', z)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [15]:
z = text_df['text'][0]
re.sub(del_pattern, '', z)

'Our hearts go out to Californians affected by #wildfires. #ClimateChange is real.  Scientists say the #ClimateCrisis is fueling wildfires nationwide. #MinesFire @nytimes\n#TickFire #KincadeFire #SaddleRidgeFire #RawsonFire #SkyFire #SawdayFire #MillerFire #PalisadesFire'

In [16]:
# deleting the pic.twitter.com and https://twitter.com strings form text_df['text']
# text_df['text'] = text_df['text'].apply(lambda x: re.sub(del_pattern, '', x))

In [17]:
# This is to grab all mentions that start with an '@'
pattern_mentions = "[\@][\w]+"

In [18]:
# Converting 'text' column as strings, just as a precaution
text_df['text'] = text_df['text'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [19]:
# Creating a 'mentions' column that has each text's mentions
text_df['mentions'] = text_df['text'].apply(lambda x: ' '.join(re.findall(pattern_mentions, x)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [20]:
text_df

Unnamed: 0,text,query,hashtags,mentions
0,Our hearts go out to Californians affected by ...,saddleridgefire,"['wildfires', 'ClimateChange', 'ClimateCrisis'...",@nytimes
1,"From the #SaddleridgeFire to the #KincadeFire,...",saddleridgefire,"['SaddleridgeFire', 'KincadeFire']",
2,Our #SaddleridgeFire leaped a 12 lane major fr...,saddleridgefire,['SaddleridgeFire'],
3,"Good Morning, Sam!\nAll is well here. However,...",saddleridgefire,['SaddleridgeFire'],
4,Was your property or home damaged by the #Sadd...,saddleridgefire,['SaddleRidgeFire'],@211LACounty
...,...,...,...,...
9070,Former state lawmaker says he's got a plan to ...,KNX1070,[],@mikegatto @KNX1070
9071,Totally makes sense to do this midday,KNX1070,[],
9072,"A Sanford, Maine mom says her 9-year-old birac...",KNX1070,[],
9073,"I noticed it in Anaheim Hills, it looks crazy!",KNX1070,[],


In [21]:
# hashtag_re.findall(text_df.iloc[0,:]['text'])

#### Pulling out the Mentions

In [22]:
#Adapted from: https://gist.github.com/mahmoud/237eb20108b5805aed5f

mention_re = re.compile("(?:^|\s)[＠ @]{1}([^\s#<>[\]|{}]+)", re.UNICODE)

#### Testing the Mentions on One Row

In [23]:
text_df.text[249]

'#RT @CAL_FIRE: #SaddleRidgeFire off Saddle Ridge Rd, Sylmar in Los Angeles County is 8,799 acres and 95% contained. Unified Command: @LAFD, @LACOFD and @Angeles_NF'

In [24]:
mention_re.findall(text_df.iloc[249,:]['text'])

['CAL_FIRE:', 'LAFD,', 'LACOFD', 'Angeles_NF']

#### Creating a Mentions Column Where the Default is None

In [25]:
text_df['mentions'] = None

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [26]:
# # [tokenizer.tokenize(posts) for posts in text_df['hashtags']]

# for posts in text_df['hashtags']:

#     try:
#         tokenizer.tokenize(posts)
#     except:
#         pass

# # women_post_tokens = [tokenizer.tokenize(posts) for posts in df[df['subreddit_target'] ==1]['all_text'] ]

In [27]:
text_df['hashtags']

0       ['wildfires', 'ClimateChange', 'ClimateCrisis'...
1                      ['SaddleridgeFire', 'KincadeFire']
2                                     ['SaddleridgeFire']
3                                     ['SaddleridgeFire']
4                                     ['SaddleRidgeFire']
                              ...                        
9070                                                   []
9071                                                   []
9072                                                   []
9073                                                   []
9074                                                   []
Name: hashtags, Length: 9075, dtype: object

--- 

---

## Tokenizing

#### Declaring Tokenizing Function:

In [34]:
tokenizer = RegexpTokenizer(r'\s+', gaps=True)

def tokenizing_function(col):

    string = ''
    
    try:
    
        for post in df[col]:
            string += ' ' + post
            
    except:
        pass
    
    all_tokens = tokenizer.tokenize(string)
    
    return all_tokens

hash_tokens = tokenizing_function('hashtags')

In [35]:
hash_tokens = tokenizing_function('hashtags')

## CVEC

### Declaring CVEC Function:

In [36]:
cvec = CountVectorizer(stop_words= 'english', min_df=5)

def cvec_function(col):
    col_cvec_matrix = cvec.fit_transform(text_df[col])

    #Convert to DataFrame
    col_cvec_df = pd.DataFrame(col_cvec_matrix.toarray(),
                          columns=cvec.get_feature_names())
    return col_cvec_df

cvec_function('text')

Unnamed: 0,00,000,00001582_saddleridge,00am,00pm,01,02,0297,03,04,...,yucaipa,yup,yxfgt6nc,zacktawataritv,zacktawataritvhttps,zero,zip,zone,zones,zoo
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9070,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9071,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9072,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9073,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Calling CVEC function with `text` column:

In [100]:
text_cvec = cvec_function('text')

### Top 50 words in text

In [168]:
top_words = text_cvec.sum().sort_values(ascending=False).head(50)

In [167]:
# Import Word2Vec
from gensim.models.word2vec import Word2Vec

# If you want to use gensim's data, import their downloader
# and load it.
import gensim.downloader as api
corpus = api.load('text8')

# If you have your own iterable corpus of cleaned data, you can 
# read it in as corpus and pass that in.

# Train a model! 
model = Word2Vec(corpus,      # Corpus of data.
                 size=100,    # How many dimensions do you want in your word vector?
                 window=5,    # How many "context words" do you want?
                 min_count=1, # Ignores words below this threshold.
                 sg=0,        # SG = 1 uses SkipGram, SG = 0 uses CBOW (default).
                 workers=4)   # Number of "worker threads" to use (parallelizes process).

# Do what you'd like to do with your data!
model.most_similar("car")



[('driver', 0.7992774844169617),
 ('taxi', 0.7635014057159424),
 ('cars', 0.7435533404350281),
 ('truck', 0.7435256242752075),
 ('motorcycle', 0.7088671922683716),
 ('vehicle', 0.6889256238937378),
 ('racing', 0.6746527552604675),
 ('passenger', 0.6717368960380554),
 ('stock', 0.6547423005104065),
 ('trucks', 0.6501966714859009)]

### Displaying the most similar to all the top words:

In [176]:
for words in top_words.index:
    try:
        print("word:", words, model.most_similar(words))
        print("-------")
    except:
        pass

  This is separate from the ipykernel package so we can avoid doing imports until


word: com [('org', 0.8835672736167908), ('www', 0.8178542852401733), ('info', 0.8163278102874756), ('yahoo', 0.8129258155822754), ('http', 0.8124989867210388), ('faq', 0.8049891591072083), ('skydivingmovies', 0.7923030853271484), ('website', 0.7843674421310425), ('blog', 0.7700207233428955), ('archive', 0.7588902711868286)]
-------
word: twitter [('amminadab', 0.6934950947761536), ('covets', 0.6524245142936707), ('wraps', 0.6413664817810059), ('hephthalite', 0.6392537355422974), ('trekked', 0.6334550380706787), ('palatinates', 0.629637598991394), ('filipp', 0.6244513988494873), ('revolve', 0.6206804513931274), ('njessi', 0.6201038360595703), ('mindlessly', 0.6157722473144531)]
-------
word: https [('ftp', 0.8911775350570679), ('edu', 0.8650883436203003), ('gov', 0.8435155153274536), ('http', 0.8371896743774414), ('msn', 0.8367120027542114), ('openoffice', 0.8282307386398315), ('globalsecurity', 0.8268226385116577), ('utexas', 0.8258215188980103), ('whatsnew', 0.8199779987335205), ('ipt

## <span style ='color:red'> If we can do the above but with the hashtags, then re-query with the hashtags/mentions, that'd be so cool!!! Also, use the cleaning from the other notebook, possibly look at doing more EDA
 