# Business Problem

# Business Understanding

# Supporting Research

# Data & Sources

[Kaggle Dataset on Covid-19 related tweets globally](https://www.kaggle.com/komalkhetlani/tweets-about-covid19-all-over-the-world)

# Proposal

To build a language bot capable of detecting tweets that may seem depressive or anxious, and of messaging those users with location-based recommendations for free, low-cost, and paid mental health services in their area and/or offering comforting suggestions.

As this is Covid-19 related, the model may have to adapt to drift as the global situation changes, but we expect it to remain relevant for at least the next several years.

# Data Understanding

# Exploratory Data Analysis

In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.classify import ClassifierI
# Fetch every NLTK resource the notebook uses. nltk.download() only reports
# "already up-to-date" when a package is installed, so re-running is safe.
nltk.download('punkt')      # tokenizer models behind word_tokenize
nltk.download('tagsets')    # tag documentation printed by upenn_tagset() below
nltk.download('stopwords')  # corpus read by stopwords.words('english') in remove_sw
nltk.download('wordnet')    # corpus backing the wordnet.* constants used by pos_tag
# Prints the Penn Treebank tag reference (long output, reference only).
nltk.help.upenn_tagset()
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import regexp_tokenize, word_tokenize, RegexpTokenizer


import math
import string
import re

%matplotlib inline

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\svett\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\svett\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


In [3]:
# Load the Kaggle tweet export from the project's data folder.
# low_memory=False is passed through unchanged from the original call.
csv_path = 'data/TweetsAboutCovid-19.csv'
dataset = pd.read_csv(csv_path, low_memory=False)
dataset.head()

Unnamed: 0,id,created_at,date,time,timezone,place,tweet,language,replies_count,retweets_count,likes_count,hashtags,cashtags,retweet,video,thumbnail
0,1.38588e+18,2021-04-24 08:43:17 UTC,4/24/2021,8:43:17,0,,🇨🇺: ✍️ Covid-19 en Cuba: 1241 nuevos casos pos...,es,0.0,0.0,0.0,"['reportando', 'cuba']",[],False,0.0,
1,1.38588e+18,2021-04-24 08:43:17 UTC,4/24/2021,8:43:17,0,,The latest The Zika Advice Paper! https://t.c...,en,0.0,0.0,0.0,"['covid19', 'amr']",[],False,0.0,
2,1.38588e+18,2021-04-24 08:43:16 UTC,4/24/2021,8:43:16,0,,Tum karo toh mantra ..woh kare toh tantra ..ai...,tl,0.0,0.0,0.0,"['covidindia', 'covidvaccine', 'covidresources...",[],False,1.0,https://pbs.twimg.com/media/EzufwnkUYA0TExx.jpg
3,1.38588e+18,2021-04-24 08:43:16 UTC,4/24/2021,8:43:16,0,,https://t.co/4rdhSH3IYl Prime Minister @Nare...,en,0.0,0.0,0.0,['covid_19'],[],False,1.0,https://pbs.twimg.com/media/EzueSRxVgAEiTie.jpg
4,1.38588e+18,2021-04-24 08:43:16 UTC,4/24/2021,8:43:16,0,,@bc_pt64 @KackCake @sherlockine1 @SternchenJvB...,de,0.0,0.0,0.0,[],[],False,0.0,


In [4]:
# Keep only the rows whose `language` code is 'en' (English tweets).
is_english = dataset['language'] == 'en'
eng_data = dataset.loc[is_english]
print("Shape of English dataset:", eng_data.shape)
eng_data.head()

Shape of English dataset: (412115, 16)


Unnamed: 0,id,created_at,date,time,timezone,place,tweet,language,replies_count,retweets_count,likes_count,hashtags,cashtags,retweet,video,thumbnail
1,1.38588e+18,2021-04-24 08:43:17 UTC,4/24/2021,8:43:17,0,,The latest The Zika Advice Paper! https://t.c...,en,0.0,0.0,0.0,"['covid19', 'amr']",[],False,0.0,
3,1.38588e+18,2021-04-24 08:43:16 UTC,4/24/2021,8:43:16,0,,https://t.co/4rdhSH3IYl Prime Minister @Nare...,en,0.0,0.0,0.0,['covid_19'],[],False,1.0,https://pbs.twimg.com/media/EzueSRxVgAEiTie.jpg
5,1.38588e+18,2021-04-24 08:43:16 UTC,4/24/2021,8:43:16,0,,Covid-19: India is going through very terrible...,en,0.0,0.0,0.0,"['presssangharsh', 'dailynews', 'news', 'india...",[],False,1.0,https://pbs.twimg.com/media/EzufxgJXsAM--Iv.png
6,1.38588e+18,2021-04-24 08:43:15 UTC,4/24/2021,8:43:15,0,,@CPBlr @KamalPantIPS speaks to me on the rules...,en,0.0,0.0,0.0,['covid19'],[],False,0.0,
7,1.38588e+18,2021-04-24 08:43:14 UTC,4/24/2021,8:43:14,0,,@Physio_voice @BiswabhusanHC @ysjagan @Audimul...,en,0.0,0.0,0.0,[],[],False,0.0,


In [5]:
# Show the dtype pandas inferred for each column of the English subset.
eng_data.dtypes

id                float64
created_at         object
date               object
time               object
timezone            int64
place              object
tweet              object
language           object
replies_count     float64
retweets_count    float64
likes_count       float64
hashtags           object
cashtags           object
retweet            object
video             float64
thumbnail          object
dtype: object

In [6]:
# Count missing values per column (isna is the pandas alias of isnull).
eng_data.isna().sum()

id                     0
created_at             0
date                   0
time                   0
timezone               0
place             411589
tweet                  0
language               0
replies_count          0
retweets_count         0
likes_count            0
hashtags               0
cashtags               0
retweet                0
video                  0
thumbnail         306591
dtype: int64

Features that may not be useful for training:  
`created_at` - full creation timestamp in UTC (Coordinated Universal Time); it duplicates `date` and `time`, so it can be dropped once the `time` feature is relabeled as UTC  
`timezone` - all values here are 0  
`place` - almost all values are null  
`language` - currently only training on the English subset  
`video` - not useful for text data  
`thumbnail` - likely related to video, and about three-quarters of the values are null (306,591 of 412,115)  

In [7]:
# Remove the columns judged not useful for training and relabel `time` so the
# name states that it is UTC.
# The cell reassigns `eng_data` from itself, so it must tolerate a second run:
# errors='ignore' makes drop() skip columns that are already gone instead of
# raising KeyError, and rename() already ignores a missing `time` label.
unused_columns = ['created_at', 'timezone', 'place', 'language', 'video', 'thumbnail']
eng_data = (
    eng_data
    .drop(columns=unused_columns, errors='ignore')
    .rename(columns={'time': 'time_utc'})
)

In [8]:
# Check the trimmed frame: dropped columns gone, `time` now shown as `time_utc`.
eng_data.head()

Unnamed: 0,id,date,time_utc,tweet,replies_count,retweets_count,likes_count,hashtags,cashtags,retweet
1,1.38588e+18,4/24/2021,8:43:17,The latest The Zika Advice Paper! https://t.c...,0.0,0.0,0.0,"['covid19', 'amr']",[],False
3,1.38588e+18,4/24/2021,8:43:16,https://t.co/4rdhSH3IYl Prime Minister @Nare...,0.0,0.0,0.0,['covid_19'],[],False
5,1.38588e+18,4/24/2021,8:43:16,Covid-19: India is going through very terrible...,0.0,0.0,0.0,"['presssangharsh', 'dailynews', 'news', 'india...",[],False
6,1.38588e+18,4/24/2021,8:43:15,@CPBlr @KamalPantIPS speaks to me on the rules...,0.0,0.0,0.0,['covid19'],[],False
7,1.38588e+18,4/24/2021,8:43:14,@Physio_voice @BiswabhusanHC @ysjagan @Audimul...,0.0,0.0,0.0,[],[],False


In [11]:
# Summary statistics for the tweet id column. The ids are stored as float64
# (see the dtypes output), so describe() treats them as ordinary numbers.
# NOTE(review): float64 cannot represent every integer of this magnitude
# exactly, so distinct ids may have collapsed — confirm against the raw CSV.
eng_data.id.describe()

count    4.121150e+05
mean     1.385317e+18
std      3.276902e+14
min      1.384720e+18
25%      1.385030e+18
50%      1.385310e+18
75%      1.385610e+18
max      1.385880e+18
Name: id, dtype: float64

In [15]:
#a function for translating a Penn Treebank tag into a WordNet part of speech
# NOTE(review): this rebinds the name `pos_tag`, which the imports cell also
# binds via `from nltk import pos_tag`; once this cell runs, the name refers
# to this function.

def pos_tag(treebank_tag):
    """Return the WordNet POS constant matching a Penn Treebank tag.

    The tag is matched by its leading letter: J -> adjective, V -> verb,
    N -> noun, R -> adverb. Any other tag is treated as a noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags fall back to noun.
    return wordnet.NOUN

    
#a function for lowercasing
def lowercase(doc):
    """Return a new list with ``.lower()`` applied to each item of ``doc``.

    ``doc`` is iterated item by item, so it should be a sequence of tokens;
    a plain string would be processed one character at a time.
    """
    lowered = []
    for token in doc:
        lowered.append(token.lower())
    return lowered

#function for stripping punctuation
# A word is a run of ASCII letters, optionally followed by one apostrophe
# (straight ' or curly ’) and more letters, so contractions such as "don't"
# and "it’s" stay in one piece. Compiled once instead of on every call.
_WORD_PATTERN = re.compile(r"[a-zA-Z]+(?:['’][a-zA-Z]+)?")

def strip(doc):
    """Split a text string into word tokens, discarding everything else.

    Parameters
    ----------
    doc : str
        Raw text, e.g. a single tweet.

    Returns
    -------
    list of str
        The words in order of appearance, with original casing. Digits,
        punctuation, symbols and whitespace are dropped; an apostrophe is
        kept only when it sits between letters.
    """
    return _WORD_PATTERN.findall(doc)

#function for removing stopwords
def remove_sw(doc):
    """Return the tokens of ``doc`` that are not NLTK English stopwords.

    Parameters
    ----------
    doc : iterable of str
        Tokens to filter. The comparison is an exact, case-sensitive match
        against the stopword list.

    Returns
    -------
    list of str
        The non-stopword tokens, in their original order.
    """
    # A set gives constant-time membership checks; the list returned by
    # stopwords.words() would be scanned in full for every token.
    eng_sw = set(stopwords.words('english'))
    return [word for word in doc if word not in eng_sw]

In [16]:
# lowercase() iterates over its argument, so a raw tweet string would come
# back as a list of single characters. Tokenize each tweet with strip() first
# so the result is a list of lowercased words per tweet.
practice_run = eng_data.tweet.apply(lambda tweet: lowercase(strip(tweet)))

# Visualization

# Modeling

# Results

# Explanatory Data Analysis

# Final Model, Conclusion, Recommendations