Modelo para anlisis de sentmientos para luego precedir si tiene sintomas de depresion.

In [1]:
from string import punctuation
import pandas as pd
import nltk
import re

In [2]:
# read the data from tweets_public.csv and create a dataframe
df = pd.read_csv('data/tweets_public.csv', encoding='utf-8')
df.head()

Unnamed: 0,airline_sentiment,is_reply,reply_count,retweet_count,text,tweet_coord,tweet_created,tweet_id,tweet_location,user_timezone
0,neutral,False,0,0,Trabajar en #Ryanair como #TMA: https://t.co/r...,,Fri Nov 03 12:05:12 +0000 2017,926419989107798016,,Madrid
1,neutral,True,0,0,@Iberia @FIONAFERRER Cuando gusten en Cancún s...,,Sun Nov 26 18:40:28 +0000 2017,934854385577943041,,Mexico City
2,negative,False,0,0,Sabiais que @Iberia te trata muy bien en santi...,,Mon Dec 25 15:40:45 +0000 2017,945318406441635840,,Madrid
3,negative,False,0,0,NUNCA NUNCA NUNCA pidáis el café de Ryanair.\n...,,Mon Nov 06 14:18:35 +0000 2017,927540721296568320,,Pacific Time (US & Canada)
4,positive,True,0,0,@cris_tortu @dakar @Iberia @Mitsubishi_ES @BFG...,,Mon Jan 01 23:00:57 +0000 2018,947965901332197376,,Buenos Aires


In [3]:
df.shape

(7867, 10)

# Preprocesamiento
Limpieza de datos

In [4]:
# create a new dataframe with only the text and airline_sentiment columns and tweet id with the name df_sentiment
df_sentiment = df[['airline_sentiment', 'text', 'airline_sentiment', 'tweet_id']]

In [5]:
# transform the text letters to lowercase
df_sentiment['text'] = df_sentiment['text'].str.lower()
df_sentiment.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sentiment['text'] = df_sentiment['text'].str.lower()


Unnamed: 0,airline_sentiment,text,airline_sentiment.1,tweet_id
0,neutral,trabajar en #ryanair como #tma: https://t.co/r...,neutral,926419989107798016
1,neutral,@iberia @fionaferrer cuando gusten en cancún s...,neutral,934854385577943041
2,negative,sabiais que @iberia te trata muy bien en santi...,negative,945318406441635840
3,negative,nunca nunca nunca pidáis el café de ryanair.\n...,negative,927540721296568320
4,positive,@cris_tortu @dakar @iberia @mitsubishi_es @bfg...,positive,947965901332197376


In [6]:
# Referencia: https://stackoverflow.com/questions/6718633/python-regular-expression-again-match-url
# remove the urls from the text but keep all the text after the url
df_sentiment.loc[:, 'text'] = df_sentiment['text'].apply(lambda x: re.split('http[s]*\S+', str(x))[0])
df_sentiment.head()

Unnamed: 0,airline_sentiment,text,airline_sentiment.1,tweet_id
0,neutral,trabajar en #ryanair como #tma:,neutral,926419989107798016
1,neutral,@iberia @fionaferrer cuando gusten en cancún s...,neutral,934854385577943041
2,negative,sabiais que @iberia te trata muy bien en santi...,negative,945318406441635840
3,negative,nunca nunca nunca pidáis el café de ryanair.\n...,negative,927540721296568320
4,positive,@cris_tortu @dakar @iberia @mitsubishi_es @bfg...,positive,947965901332197376


In [7]:
# remove the punctuation from the text
df_sentiment.loc[:, 'text'] = df_sentiment['text'].apply(lambda x: ''.join(c for c in x if c not in punctuation))
df_sentiment.head()

Unnamed: 0,airline_sentiment,text,airline_sentiment.1,tweet_id
0,neutral,trabajar en ryanair como tma,neutral,926419989107798016
1,neutral,iberia fionaferrer cuando gusten en cancún se ...,neutral,934854385577943041
2,negative,sabiais que iberia te trata muy bien en santia...,negative,945318406441635840
3,negative,nunca nunca nunca pidáis el café de ryanair\nb...,negative,927540721296568320
4,positive,cristortu dakar iberia mitsubishies bfgoodrich...,positive,947965901332197376


In [8]:
# change the \n to a space
df_sentiment.loc[:, 'text'] = df_sentiment['text'].apply(lambda x: x.replace('\n', ' '))
df_sentiment.head()

Unnamed: 0,airline_sentiment,text,airline_sentiment.1,tweet_id
0,neutral,trabajar en ryanair como tma,neutral,926419989107798016
1,neutral,iberia fionaferrer cuando gusten en cancún se ...,neutral,934854385577943041
2,negative,sabiais que iberia te trata muy bien en santia...,negative,945318406441635840
3,negative,nunca nunca nunca pidáis el café de ryanair bu...,negative,927540721296568320
4,positive,cristortu dakar iberia mitsubishies bfgoodrich...,positive,947965901332197376


In [9]:
# remove the stopwrods from the text
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = set(stopwords.words('spanish'))
df_sentiment.loc[:, 'text'] = df_sentiment['text'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words]))
df_sentiment.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\raula\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,airline_sentiment,text,airline_sentiment.1,tweet_id
0,neutral,trabajar ryanair tma,neutral,926419989107798016
1,neutral,iberia fionaferrer gusten cancún viaja disfrut...,neutral,934854385577943041
2,negative,sabiais iberia trata bien santiago chilete cam...,negative,945318406441635840
3,negative,nunca nunca nunca pidáis café ryanair bueno ve...,negative,927540721296568320
4,positive,cristortu dakar iberia mitsubishies bfgoodrich...,positive,947965901332197376


In [10]:
# remove the emojis from the text

# regular expression pattern to remove emojis from text
emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # faces
        u"\U0001F300-\U0001F5FF"  # simbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags
        "]+", flags=re.UNICODE)

# use lambda function to remove the emojis from the text
df_sentiment.loc[:, 'text'] = df_sentiment['text'].apply(lambda x: emoji_pattern.sub(r'', x))
df_sentiment.head()

Unnamed: 0,airline_sentiment,text,airline_sentiment.1,tweet_id
0,neutral,trabajar ryanair tma,neutral,926419989107798016
1,neutral,iberia fionaferrer gusten cancún viaja disfrut...,neutral,934854385577943041
2,negative,sabiais iberia trata bien santiago chilete cam...,negative,945318406441635840
3,negative,nunca nunca nunca pidáis café ryanair bueno ve...,negative,927540721296568320
4,positive,cristortu dakar iberia mitsubishies bfgoodrich...,positive,947965901332197376


In [11]:
# remove the numbers from the text
df_sentiment.loc[:, 'text'] = df_sentiment['text'].apply(lambda x: re.sub(r'\d+', '', x))
df_sentiment.head()

Unnamed: 0,airline_sentiment,text,airline_sentiment.1,tweet_id
0,neutral,trabajar ryanair tma,neutral,926419989107798016
1,neutral,iberia fionaferrer gusten cancún viaja disfrut...,neutral,934854385577943041
2,negative,sabiais iberia trata bien santiago chilete cam...,negative,945318406441635840
3,negative,nunca nunca nunca pidáis café ryanair bueno ve...,negative,927540721296568320
4,positive,cristortu dakar iberia mitsubishies bfgoodrich...,positive,947965901332197376
