## Work in Progress

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
import glob
import os
import re
# Apply seaborn's default theme to every figure drawn below.
sns.set()
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)


In [None]:
# Load the tweet CSV; index_col=None means no column is used as the row index.
df = pd.read_csv(
    '/kaggle/input/texas-winter-strom-2021-tweets/tweets_txwx2021.csv',
    index_col=None,
)

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Per-column dtype and non-null count summary.
df.info()

In [None]:
# Preview the first rows of the frame.
df.head()

In [None]:
# Display only the user_name, text, created and user_followers columns.
df[['user_name','text','created','user_followers']]

## Missing values

In [None]:
# Heatmap of the boolean null mask: one row per tweet, one column per field.
sns.heatmap(data=df.isnull())

In [None]:
# Follower counts are plotted on a log1p scale (log(1 + x), defined at zero).
log_followers = np.log1p(df['user_followers'])
sns.histplot(log_followers, color='g')

In [None]:
# Character count of each tweet. The vectorised .str.len() yields NaN for a
# missing tweet, whereas calling len() on a NaN value raises TypeError.
df['tweet_length'] = df['text'].str.len()
sns.histplot(df['tweet_length'], color='g')

In [None]:
def clean_text(text):
    """Strip URLs, bracketed text, digit-bearing tokens and tag-like markup.

    Parameters
    ----------
    text : str
        Raw tweet text. Any non-string value (for example ``None`` or a
        ``NaN`` coming from a missing cell) is treated as empty text.

    Returns
    -------
    str
        The cleaned text, with every run of whitespace collapsed to a single
        space and no leading or trailing whitespace.
    """
    if not isinstance(text, str):
        return ''
    # Raw string literals keep regex escapes such as \S, \[ and \d from being
    # parsed as (invalid) Python string escapes.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # URLs
    # Flatten newlines/tabs first: "." in the patterns below does not match a
    # newline, so bracketed or tagged spans that wrap lines would survive.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\[.*?\]', '', text)   # [bracketed] spans
    text = re.sub(r'\w*\d\w*', '', text)  # any token containing a digit
    text = re.sub(r'<.*?>+', '', text)    # <tag>-like markup
    # The removals above leave gaps behind, so normalise whitespace again.
    return re.sub(r'\s+', ' ', text).strip()

In [None]:
# Store the cleaned version of every tweet next to the raw text.
df['clean_text'] = [clean_text(tweet) for tweet in df['text']]

## Word Cloud of tweets from users with more than 100K followers

In [None]:
def generate_wc(fig_size=(10,10), 
                 text_series = None, 
                 stop_words = None, 
                 max_font_size = 100,
                 max_words = 150,
                 background_color = 'white',
                 color_map = 'inferno',
                 inter_polation = 'bilinear'
                ):
    """Draw a word cloud built from a collection of texts.

    Parameters
    ----------
    fig_size : tuple
        Passed to ``plt.subplots`` as ``figsize``.
    text_series : iterable of str, optional
        Texts to merge into the cloud. When omitted, ``df['text']`` is looked
        up at call time. Non-string entries (e.g. NaN) are skipped.
    stop_words : iterable of str, optional
        Extra words to exclude, added to wordcloud's built-in ``STOPWORDS``.
    max_font_size, max_words, background_color : optional
        Forwarded unchanged to ``WordCloud``.
    color_map : str
        Forwarded to ``WordCloud`` as ``colormap``.
    inter_polation : str
        Forwarded to ``ax.imshow`` as ``interpolation``.

    Returns
    -------
    None
        The figure is drawn as a side effect, with the axes hidden.
    """
    # wordcloud is only needed by this helper, so it is imported here.
    from wordcloud import WordCloud, STOPWORDS

    if text_series is None:
        # Resolved on each call: a default written as df['text'] in the
        # signature would be evaluated once, when the def statement runs.
        text_series = df['text']

    # str.join rejects non-string items such as NaN, so leave them out.
    all_words = " ".join(text for text in text_series if isinstance(text, str))

    stopwords = set(STOPWORDS)
    if stop_words is not None:
        stopwords.update(stop_words)

    wordcloud_ALL = WordCloud(max_font_size=max_font_size,
                              collocations=False,
                              stopwords=stopwords,
                              max_words=max_words,
                              background_color=background_color,
                              colormap=color_map).generate(all_words)

    # Create the figure only after the cloud has been generated, so a failure
    # above does not leave an empty figure behind.
    _fig, ax = plt.subplots(1, 1, figsize=fig_size)
    ax.imshow(wordcloud_ALL, interpolation=inter_polation)
    ax.axis('off')

In [None]:
# Words to leave out of the cloud, on top of wordcloud's built-in stop words.
stop_words = [
    'RT', 'Texas', 'state', 'people', 'Texan', 'Texans', 'week', 'now',
    'U', 'via', 's', 'amp', 'let', 'one', 't', 'de', 'la', 'en', 'el',
    'las', 'ola', 'winter', 'storm', 'will',
]
# Only tweets whose author has more than 100,000 followers are included.
generate_wc(
    fig_size=(15, 15),
    text_series=df['clean_text'][df['user_followers'] > 100000],
    stop_words=stop_words,
    max_font_size=100,
    max_words=150,
    background_color='white',
    color_map='magma',
    inter_polation='bilinear',
)

## Word Cloud of all tweets

In [None]:
# Same exclusions as the high-follower cloud, plus 'm', 'ga', 're' and 'two'.
stop_words = ['m','ga','re','two','RT','Texas','state','people','Texan','Texans','week','now','U','via','s','amp','let','one', 't', 'de','la','en','el', 'las','ola', 'winter','storm', 'will']
# generate_wc hides the axes itself. Its `ax` is local to the function, so a
# trailing module-level ax.axis('off') would raise NameError and is omitted.
generate_wc(fig_size=(15,15), 
                 text_series = df['clean_text'], 
                 stop_words = stop_words, 
                 max_font_size = 100,
                 max_words = 150,
                 background_color = '#f5f5f5',
                 color_map = 'twilight',
                 inter_polation = 'bilinear'
            )