In [None]:
import pandas as pd
import numpy as np
import contractions
import fasttext
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

In [None]:
# Load the raw tweets and keep only the label column and the text column.
df = pd.read_csv('tweet_emotions.csv').loc[:, ['sentiment', 'content']]

In [None]:
# Summary statistics for the retained columns (sentiment, content).
df.describe()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
#Removing contractions, tokenizing and lowering the letters
def _expand_contractions(text):
    """Expand contractions in every whitespace-separated piece of `text`.

    The pieces are re-joined with single spaces, so runs of whitespace in the
    input collapse to one space.
    """
    return ' '.join(str(contractions.fix(piece)) for piece in text.split())


def _lowercase_tokens(tokens):
    """Return a new list with every token lower-cased."""
    return [token.lower() for token in tokens]


df['no_contract'] = [_expand_contractions(text) for text in df['content']]
# Tokenize before lower-casing so the tokenizer sees the original casing.
df['tokenized'] = df['no_contract'].apply(word_tokenize)
df['lower'] = df['tokenized'].apply(_lowercase_tokens)

In [None]:
#Removing Punctuations
punc = string.punctuation
# `punc` is a str, so `word in punc` would be a *substring* test: tokens such as
# '...', "''" or '!!' are not substrings of string.punctuation and would be
# kept. Testing each character against a set removes every token that is made
# up only of punctuation characters, whatever its length.
_punc_chars = frozenset(punc)


def _is_punctuation(token):
    """Return True when every character of `token` is in string.punctuation.

    An empty token also counts as punctuation (and is therefore dropped).
    """
    return all(ch in _punc_chars for ch in token)


df['no_punc'] = df['lower'].apply(
    lambda tokens: [word for word in tokens if not _is_punctuation(word)]
)

In [None]:
#Removing non english words
# Each row's contraction-expanded text is classified with the fastText
# language-identification model; rows whose top label is not English are dropped.
pretrained_model = "lid.176.bin"
model = fasttext.load_model(pretrained_model)
label_prefix = '__label__'
langs = []
for sent in df['no_contract']:
    # predict() returns (labels, probabilities); the first label is the best
    # guess, e.g. '__label__en'. Read the code from the label string itself
    # rather than slicing the repr of the tuple, which truncated any code
    # longer than two letters and depended on the exact repr layout.
    top_label = model.predict(sent)[0][0]
    if top_label.startswith(label_prefix):
        top_label = top_label[len(label_prefix):]
    langs.append(top_label)
df['language']=langs
# Keep only English rows; the surviving rows keep their original index labels.
df = df.loc[df['language'] == 'en'].copy()

In [None]:
#Removing stopwords
stop_words = set(stopwords.words('english'))


def _drop_stopwords(tokens):
    """Return the tokens of one row that are not in the English stop-word set."""
    return [token for token in tokens if token not in stop_words]


df['stopwords_removed'] = df['no_punc'].apply(_drop_stopwords)

In [None]:
#Adding tags for lemmatization
# Tag every row in one batched call: NLTK recommends pos_tag_sents() over
# repeated pos_tag() calls when tagging more than one sentence. The tags are
# the same; only the per-row setup cost goes away.
_tagged_rows = nltk.tag.pos_tag_sents(df['stopwords_removed'].tolist())
# Wrap in a Series on df's index so each row gets its own list of (word, tag).
df['pos_tags'] = pd.Series(_tagged_rows, index=df.index, dtype=object)
def get_wordnet_pos(tag):
    """Map a POS tag string to the WordNet POS constant used for lemmatization.

    Tags starting with 'J', 'V', 'N' or 'R' map to wordnet.ADJ, wordnet.VERB,
    wordnet.NOUN and wordnet.ADV respectively. Any other tag falls back to
    wordnet.NOUN.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if tag.startswith(prefix):
            return pos
    # 'N' tags and every unrecognised tag both resolve to NOUN.
    return wordnet.NOUN
def _to_wordnet_pairs(tagged_tokens):
    """Replace the tag in each (word, tag) pair with its WordNet POS constant."""
    return [(token, get_wordnet_pos(raw_tag)) for token, raw_tag in tagged_tokens]


df['wordnet_pos'] = df['pos_tags'].apply(_to_wordnet_pairs)


In [None]:
#Lemmatizing and converting into string
wnl = WordNetLemmatizer()


def _lemmatize_row(pairs):
    """Lemmatize each (word, wordnet_pos) pair of one row."""
    return [wnl.lemmatize(token, pos) for token, pos in pairs]


df['lemmatized'] = df['wordnet_pos'].apply(_lemmatize_row)
# Join each row's lemmas into one space-separated string.
df['lemma_str'] = [
    ' '.join(str(lemma) for lemma in lemmas)
    for lemmas in df['lemmatized']
]

In [None]:
#Saving the file
# Keep only the label and the cleaned text, then write them out as UTF-8 CSV
# without the index column.
df = df.loc[:, ['sentiment', 'lemma_str']]
df.to_csv(
    '/content/drive/My Drive/filename.csv',
    encoding='utf-8',
    index=False,
)