## Load data and filter

In [None]:
import pandas as pd

In [120]:
df = pd.read_csv('./data.csv')
df.shape

(15924, 12)

In [None]:
df = df[df.Favorites > 3]
df = df.drop_duplicates(subset = 'Text', ignore_index = True)
df.shape

## Cleaning data and removing stopwords

In [67]:
import re
import nltk
nltk.download('punkt')
nltk.download('stopwords')
english_stopwords = set(stopwords.words("english"))
from nltk.corpus import stopwords 

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...




True

In [121]:
def clean_text(text):
  text = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', '', text) # Remove URLs
  text = re.sub('@[^\s]+', '', text) # Remove usernames
  text = re.sub(r'#([^\s]+)', r'\1', text) # Remove the # in #hashtag
  text = re.sub(r'[^A-Za-z]+', ' ', text) # Remove special characters and numbers
  text = re.sub(r'rt|fb|nflx|goog|googl|axp|aapl', '', text, flags = re.I) # Remove tickers
  text = re.sub(r'\b[a-zA-Z]\b', '', text) # Remove "single-worders"
  text = re.sub(r' [ ]+', ' ', text) # Remove extra whitespaces
  text = text.lower() # Convert text to lower-case
  return text

In [35]:
def removestopwords(text):
  tokens = nltk.word_tokenize(text, 'english')
  filtered_tokens = [i for i in tokens if i not in english_stopwords]
  text = ' '.join(filtered_tokens)
  return text

In [122]:
df['Text_clean'] = df['Text'].map(lambda x: clean_text(x))
df['Text_clean'] = df['Text_clean'].map(lambda x: remove_stopwords(x))

In [131]:
df = df.dropna(subset = ['Text_clean'])
df = df[(df.Text_clean != '')]
df.shape

(15863, 13)

In [91]:
# from nltk.stem import PorterStemmer
# st = PorterStemmer()
# df['Text_clean'] = df['Text_clean'].apply(lambda x: ' '.join([st.stem(word) for word in x.split()]))

## Sentiment analysis

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

In [133]:
sid = SentimentIntensityAnalyzer()
df['Sentiment_score'] = df.apply(lambda row: sid.polarity_scores(row['Text_clean'])['compound'], axis = 1)
df['Sentiment'] = df.apply(lambda row: 1 if row['Sentiment_score'] > 0 else 0, axis = 1)
sum(df.Sentiment)

8429

## Export cleaned data

In [134]:
df = df[['Text_clean', 'Sentiment']]
df.rename(columns = {'Text_clean': 'Text'}, inplace = True)

In [135]:
df.head(15)

Unnamed: 0,Text,Sentiment
0,overwhelmingly positive reviews grab friend ge...,1
1,moved ahead hold still like cha though,1
2,lenovo legion chance rise rest unmatched perfo...,0
3,list means jpm gs trv csco msft mcd hd cvx xom...,0
4,expose penny stock scams learn spot twtr,0
5,holidays dressed hebrew santa gave away love twtr,1
6,investors worry big fat list risk factors djia...,0
7,million yachts look like,1
8,cookies cream counteops custom cabinets brass ...,1
9,weekly coil earnings january,0


In [136]:
df.to_csv('clean_data.csv', index = False)