In [1]:
# imports
import os
import pandas as pd
import matplotlib.pyplot as plt
import re
from scipy.stats import chisquare
from wordcloud import WordCloud

import numpy as np
from nltk.stem.lancaster import LancasterStemmer 
from nltk.stem import SnowballStemmer, WordNetLemmatizer, PorterStemmer

wordnet_lemmatizer = WordNetLemmatizer()
snowball_stemmer = SnowballStemmer('english')
porter_stemmer = PorterStemmer()
lancaster_stemmer = LancasterStemmer()
import nltk


In [2]:
# Build the data directories from path components (no hard-coded separators)
# so the notebook runs unchanged on both Windows and Linux.
_data_root = os.path.abspath(os.path.join(os.getcwd(), os.path.pardir, 'data'))
raw_data_path = os.path.join(_data_root, 'raw')
processed_data_path = os.path.join(_data_root, 'processed')
interim_data_path = os.path.join(_data_root, 'interim')
for _data_dir in (raw_data_path, processed_data_path, interim_data_path):
    print(_data_dir)

G:\Git Projects\twitter_product_sentiment\data\raw
G:\Git Projects\twitter_product_sentiment\data\processed
G:\Git Projects\twitter_product_sentiment\data\interim


In [3]:
# Load the training split from the interim data directory.
train_csv_path = os.path.abspath(os.path.join(interim_data_path, 'train.csv'))
df_train = pd.read_csv(train_csv_path)

In [4]:
# Load the test split from the interim data directory.
test_csv_path = os.path.abspath(os.path.join(interim_data_path, 'test.csv'))
df_test = pd.read_csv(test_csv_path)

In [5]:
# NOTE(review): the later cells shown here pass their own prefix literals
# ('train' / 'test') to process_txt, so this module-level value looks unused — confirm.
prefix = 'test'

In [6]:
# Inspect the available columns. Note that the 'count_urls ' column name
# carries a trailing space, which the column selectors below must reproduce.
df_train.columns

Index(['Unnamed: 0', 'tweet_id', 'tweet', 'sentiment', 'charcount',
       'countwords', '@counts', '#counts', 'Capscounts',
       'count_excl_quest_marks', 'count_urls ', 'count_special_chars',
       'Company'],
      dtype='object')

In [7]:
def return_req_columns(df):
    '''Select the tweet text, the sentiment label and the count features from ``df``.

    The 'count_urls ' name intentionally keeps its trailing space because that
    is how the column is spelled in the loaded data.
    '''
    wanted_columns = [
        'tweet',
        'sentiment',
        'charcount',
        'countwords',
        '@counts',
        '#counts',
        'Capscounts',
        'count_excl_quest_marks',
        'count_urls ',
        'count_special_chars',
        'Company',
    ]
    return df[wanted_columns]

def return_req_columns_test(df):
    '''Select the tweet text and the count features from ``df`` (no 'sentiment' column).

    The 'count_urls ' name intentionally keeps its trailing space because that
    is how the column is spelled in the loaded data.
    '''
    wanted_columns = [
        'tweet',
        'charcount',
        'countwords',
        '@counts',
        '#counts',
        'Capscounts',
        'count_excl_quest_marks',
        'count_urls ',
        'count_special_chars',
        'Company',
    ]
    return df[wanted_columns]

def remove_tags(text, prefix):
    '''Strip every ``#hashtag`` from ``text`` and extend ``prefix`` with ``_rt``.

    Returns a two-element list ``[cleaned_text, new_prefix]`` (the sibling
    cleaning helpers return tuples; the list is kept here unchanged).
    Whitespace around a removed tag is left in place.
    '''
    without_tags = re.sub(r'#\w+', "", text)  # '#' followed by word characters
    return [without_tags, prefix + '_rt']

def remove_mentions(text,prefix):
    '''Strip every ``@mention`` from ``text`` and extend ``prefix`` with ``_rm``.

    Returns the tuple ``(cleaned_text, new_prefix)``. Whitespace around a
    removed mention is left in place.
    '''
    without_mentions = re.sub(r'@\w+', "", text)  # '@' followed by word characters
    return without_mentions, prefix + '_rm'

def remove_english_stopwords(text,prefix):
    '''Drop every NLTK English stop word from ``text`` and extend ``prefix`` with ``_res``.

    ``text`` is split on whitespace and the surviving tokens are re-joined with
    single spaces. Matching is an exact, case-sensitive comparison of each token.
    NOTE(review): the NLTK list appears to be lower-case only, so capitalised
    stop words would be kept — confirm whether that is intended.

    Returns the tuple ``(cleaned_text, new_prefix)``.
    '''
    from nltk.corpus import stopwords
    english_stop_words = set(stopwords.words('english'))
    kept_tokens = []
    for token in text.split():
        if token in english_stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens), prefix + '_res'

def remove_stopwords(text,prefix):
    '''Drop NLTK English stop words from ``text`` but keep the required words.

    Negations, contrast words and intensifiers (``required_words``) are kept
    even though NLTK lists them as stop words. ``text`` is split on whitespace
    and the surviving tokens are re-joined with single spaces. Matching is an
    exact, case-sensitive comparison of each token.

    Returns the tuple ``(cleaned_text, new_prefix)`` where ``new_prefix`` is
    ``prefix`` extended with ``_rs``.
    '''
    from nltk.corpus import stopwords
    required_words = {
        'was', 'did', 'but', 'against', 'not', 'no', 'nor', 'over', 'under',
        'again', 'few', 'more', 'most', 'too', 'very',
        'couldn', "couldn't", "don't", 'ain', 'aren', "aren't",
        'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't",
        'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma',
        'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't",
        'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't",
        'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't",
    }
    # Keep the filtered stop words as a set: membership tests stay O(1) per
    # token instead of scanning a list for every word of every tweet.
    stop_words = set(stopwords.words('english')) - required_words
    text = ' '.join(word for word in text.split() if word not in stop_words)
    prefix += '_rs'
    return text, prefix

def remove_lest_than_3(text,prefix):
    '''Drop whitespace-separated tokens shorter than 3 characters.

    Tokens of length 3 or more are kept and re-joined with single spaces;
    ``prefix`` is extended with ``_rlt3``.

    Returns the tuple ``(cleaned_text, new_prefix)``.
    '''
    long_enough = (token for token in text.split() if len(token) >= 3)
    return ' '.join(long_enough), prefix + '_rlt3'

def apply_lemmatization(text, lemmatizer):
    '''Tokenize ``text`` with ``nltk.word_tokenize`` and lemmatize each token.

    ``lemmatizer`` must expose a ``lemmatize(word)`` method. The lemmas are
    returned joined by single spaces.
    '''
    lemmas = []
    for token in nltk.word_tokenize(text):
        lemmas.append(lemmatizer.lemmatize(token))
    return ' '.join(lemmas)


def apply_stemmer(text, stemm):
    '''Tokenize ``text`` with ``nltk.word_tokenize`` and stem each token.

    ``stemm`` must expose a ``stem(word)`` method. The stems are returned
    joined by single spaces.
    '''
    stems = []
    for token in nltk.word_tokenize(text):
        stems.append(stemm.stem(token))
    return ' '.join(stems)

def stemming_all(df,prefix):
    '''Write four normalised variants of ``df`` and return them.

    The variants are, in order: WordNet lemmatization, Porter stemming,
    Snowball stemming and Lancaster stemming of the ``tweet`` column. Each one
    is saved under ``processed_data_path`` as
    ``<prefix>_df_tweet_clean_<name>_stemmer.csv``.

    Every variant is built on its own copy of ``df`` from the original tweet
    text. Previously all four names aliased the same frame, so each stemmer ran
    on the output of the one before it, the caller's frame was overwritten, and
    the four returned frames were one object.

    Returns a 4-tuple of DataFrames (wordnet, porter, snowball, lancaster).
    '''
    variants = (
        (apply_lemmatization, wordnet_lemmatizer, '_df_tweet_clean_wordnet_stemmer.csv'),
        (apply_stemmer, porter_stemmer, '_df_tweet_clean_porter_stemmer.csv'),
        (apply_stemmer, snowball_stemmer, '_df_tweet_clean_snowball_stemmer.csv'),
        (apply_stemmer, lancaster_stemmer, '_df_tweet_clean_lancaster_stemmer.csv'),
    )
    results = []
    for transform, normaliser, file_suffix in variants:
        variant = df.copy()  # independent frame; the input stays untouched
        # Always read from the original df so the normalisers are never chained.
        variant['tweet'] = df['tweet'].apply(lambda tweet: transform(tweet, normaliser))
        variant.to_csv(os.path.abspath(os.path.join(processed_data_path, prefix + file_suffix)))
        results.append(variant)
    return tuple(results)


In [8]:

# Maps a step name to the function implementing it; used by call_func, which
# invokes the looked-up function as func(text, prefix).
# NOTE(review): apply_lemmatization / apply_stemmer take (text, lemmatizer/stemmer)
# rather than (text, prefix), so they do not fit that call shape — confirm they
# belong in this table.
dispatcher = {
    'remove_tags': remove_tags,
    'remove_mentions': remove_mentions,
    'remove_english_stopwords': remove_english_stopwords,
    'remove_stopwords': remove_stopwords,
    'remove_lest_than_3': remove_lest_than_3,
    'apply_lemmatization': apply_lemmatization,
    'apply_stemmer': apply_stemmer,
}


def call_func(text, prefix, func):
    '''Look up ``func`` in ``dispatcher`` and call it as ``handler(text, prefix)``.

    Parameters
    ----------
    text : str
        Text passed to the step.
    prefix : str
        Prefix passed to the step.
    func : str
        Key into ``dispatcher``.

    Returns
    -------
    Whatever the step returns, or the string ``"Invalid function"`` when
    ``func`` is not a key of ``dispatcher``.

    Errors raised inside the step itself are no longer swallowed. A bare
    ``except`` used to turn any failure into the sentinel string, which callers
    indexing the result with ``[0]`` would silently read as the text ``"I"``.
    '''
    handler = dispatcher.get(func)
    if handler is None:
        return "Invalid function"
    return handler(text, prefix)



In [9]:
def process_txt(df,prefix):
    '''Run the tweet-cleaning steps on ``df`` in order, checkpointing after each one.

    For every text step the ``tweet`` column of ``df`` is replaced with the
    cleaned text (the frame passed in is modified), and ``prefix`` is advanced
    by calling the same step once on a dummy string to obtain its suffix. The
    final step calls ``stemming_all(df, prefix)`` and ignores its return value.
    After each step ``df`` is written to ``<prefix>df.csv`` under
    ``processed_data_path``. Nothing is returned.

    NOTE(review): 'remove_stopwords' runs after 'remove_english_stopwords' has
    already removed the full stop-word list, so it looks like a no-op on the
    text here — confirm the intended order.
    '''
    text_steps = ('remove_tags', 'remove_mentions', 'remove_english_stopwords', 'remove_stopwords', 'remove_lest_than_3')
    for step in text_steps + ('stemming_all',):
        print('\ncalling function',step)
        print('\nprefix before calling: ',prefix)
        if step in text_steps:
            def clean(tweet):
                # Element 0 of the step's result is the cleaned text.
                return call_func(tweet, prefix, step)[0]

            df['tweet'] = df['tweet'].apply(clean)
            # Only the updated prefix is needed from this extra call.
            _, prefix = call_func(' abcd ', prefix, step)
        else:
            stemming_all(df, prefix)
        print('\nprefix after calling: ',prefix)
        checkpoint_path = os.path.join(processed_data_path, prefix + 'df.csv')
        df.to_csv(os.path.abspath(checkpoint_path))

In [10]:
# Keep only the tweet text, the sentiment label and the count features, then
# run the cleaning pipeline. process_txt assigns the cleaned text back to the
# 'tweet' column of the frame it is given and writes CSVs under processed_data_path.
df_train = return_req_columns(df_train)
process_txt(df_train,'train')


calling function remove_tags

prefix before calling:  train

prefix after calling:  train_rt

calling function remove_mentions

prefix before calling:  train_rt

prefix after calling:  train_rt_rm

calling function remove_english_stopwords

prefix before calling:  train_rt_rm

prefix after calling:  train_rt_rm_res

calling function remove_stopwords

prefix before calling:  train_rt_rm_res

prefix after calling:  train_rt_rm_res_rs

calling function remove_lest_than_3

prefix before calling:  train_rt_rm_res_rs

prefix after calling:  train_rt_rm_res_rs_rlt3

calling function stemming_all

prefix before calling:  train_rt_rm_res_rs_rlt3

prefix after calling:  train_rt_rm_res_rs_rlt3


In [11]:
# Keep only the tweet text and the count features of the test split, then run
# the same cleaning pipeline on it. The pipeline must receive df_test: passing
# df_train here wrote the already-processed training tweets out under the
# 'test' file names.
df_test = return_req_columns_test(df_test)
process_txt(df_test,'test')


calling function remove_tags

prefix before calling:  test

prefix after calling:  test_rt

calling function remove_mentions

prefix before calling:  test_rt

prefix after calling:  test_rt_rm

calling function remove_english_stopwords

prefix before calling:  test_rt_rm

prefix after calling:  test_rt_rm_res

calling function remove_stopwords

prefix before calling:  test_rt_rm_res

prefix after calling:  test_rt_rm_res_rs

calling function remove_lest_than_3

prefix before calling:  test_rt_rm_res_rs

prefix after calling:  test_rt_rm_res_rs_rlt3

calling function stemming_all

prefix before calling:  test_rt_rm_res_rs_rlt3

prefix after calling:  test_rt_rm_res_rs_rlt3
