In [22]:
#Import the required libraries

import pandas as pd
import numpy as np
# For visualizations
import matplotlib.pyplot as plt
import seaborn as sns
# For regular expressions
import re
# For handling string
import string
# For performing mathematical operations
import math
import matplotlib.pyplot as plt
# For missing values
import missingno as msg
#For datetime
import datetime
# For handling warnings
import warnings
warnings.filterwarnings("ignore")

In [33]:
# read the cleaned dataset

df=pd.read_csv("clean_reddit_01_2015.csv",encoding='utf-8')
df.head()

Unnamed: 0,body,downs,created_utc,score,author,distinguished,archived,subreddit,author_flair_css_class,author_flair_text,gilded,ups,controversiality,edited
0,Most of us have some family members like this....,0,1420070400,14,YoungModern,,False,exmormon,,,0,14,0,False
1,But Mill's career was way better. Bentham is l...,0,1420070400,3,RedCoatsForever,,False,CanadaPolitics,on,Ontario,0,3,0,False
2,"Mine uses a strait razor, and as much as i lov...",0,1420070400,1,vhisic,,False,AdviceAnimals,,,0,1,0,False
3,"Very fast, thank you!",0,1420070400,2,Mastersimpson,,False,freedonuts,,,0,2,0,False
4,"The guy is a professional, and very good at wh...",0,1420070400,6,BigGupp1,,False,WTF,,,0,6,0,False


### Remove corrupt rows 

In [34]:
def is_number(num):
    if pd.isna(num):
        return True
    try:
        float(num)
        return True
    except ValueError:
        return False

def is_integer(num):
    if pd.isna(num):
        return True
    try:
        int(num)
        return True
    except ValueError:
        return False

def valid_name(name):
    name_regex = re.compile(r"\A[A-Za-z0-9][A-Za-z0-9_-]{1,20}\Z")
    return bool(name_regex.match(name)) or pd.isna(name)

def valid_body(body):
    return len(body.strip()) > 0 or pd.isna(body)

def is_boolean(boo):
    return (str(boo) in ['True', 'False']) or pd.isna(boo)

def valid_controversiality(controversiality):
    return (is_number(controversiality) and float(controversiality) <= 1 and float(controversiality) >= 0) or pd.isna(controversiality)

def valid_utc(utc):
    return is_number(utc) and (len(str(utc)) == 12 or len(str(utc)) == 10) or pd.isna(utc)

def valid_distinguished(distinguished):
    return (str(distinguished) in ['nan', 'moderator', 'admin', 'special']) or pd.isna(distinguished)

def valid_subreddit_type(subreddit_type):
    return str(subreddit_type) in ['public', 'restricted', 'user'] or pd.isna(subreddit_type)

In [35]:
def find_corrupt_rows(df):
    corrupt_rows = []
    for index, row in df.iterrows():
        
        is_invalid_feature_info = {'author': False, 'author_flair_css_class': False, 'author_flair_text': False, 'body': False,
       'controversiality': False, 'created_utc': False, 'distinguished': False,
       'edited': False, 'gilded': False, 'score': False,'subreddit': False}
        try:
            if not valid_name(row['author']):
                is_invalid_feature_info['author'] = True
            if not valid_body(row['body']):
                is_invalid_feature_info['body'] = True
            if not valid_controversiality(row['controversiality']):
                is_invalid_feature_info['controversiality'] = True
            if not valid_utc(row['created_utc']):
                is_invalid_feature_info['created_utc'] = True
            if not valid_distinguished(row['distinguished']):
                is_invalid_feature_info['distinguished'] = True
            if not is_boolean(row['edited']):
                is_invalid_feature_info['edited'] = True
            if not is_integer(row['gilded']):
                is_invalid_feature_info['gilded'] = True
            if not is_integer(row['score']): 
                is_invalid_feature_info['score'] = True
            if not valid_name(row['subreddit']): 
                is_invalid_feature_info['subreddit'] = True
            if any(list(is_invalid_feature_info.values())):
                corrupt_row_info = (index, is_invalid_feature_info)
                corrupt_rows.append(corrupt_row_info)
        except:
            corrupt_row_info = (index, is_invalid_feature_info)
            corrupt_rows.append(corrupt_row_info)
    return corrupt_rows

In [36]:
def get_valid_and_corrupt_df(df):
    corrupt_row_indices = find_corrupt_rows(df)
    print("Number of corrupt rows: ", len(corrupt_row_indices), "Number of valid rows: ", str(len(df)-len(corrupt_row_indices)))
    indices_to_drop, errors = zip(*corrupt_row_indices)
    indices_to_drop = [int(i) for i in indices_to_drop]
    corrupt_rows = df.iloc[indices_to_drop,:]
    valid_rows = df.copy()
    valid_rows = valid_rows.drop(df.index[indices_to_drop])
    return (valid_rows, corrupt_rows)

In [37]:
valid_train, invalid_train = get_valid_and_corrupt_df(df)

Number of corrupt rows:  8704 Number of valid rows:  178525


In [38]:
def enforce_data_types(df):
    original_data_types = {'author': str, 'author_flair_css_class': str, 'author_flair_text': str, 'body': str,
              'controversiality': float, 'created_utc': int, 'distinguished': str, 'edited': bool, 'gilded': int,
                           'score': int, 'subreddit': str}
    return df.astype(original_data_types)

In [39]:
def fill_in_missing_values(df):
    avg_created_utc = np.mean([time for time in df['created_utc'] if not pd.isna(time)])
    avg_controversiality = np.mean([contro for contro in df['controversiality'] if not pd.isna(contro)])
    replacements = {'author':'', 'author_flair_css_class':'', 'author_flair_text':'', 'body':'',
                     'controversiality': avg_controversiality, 'created_utc':avg_created_utc,
                    'distinguished':False, 'edited':False, 'gilded':False,
                    'score':0, 'subreddit':''}
    return df.fillna(value=replacements)

In [40]:
def clean(df):
    df = enforce_data_types(df)
    print("Data types: ", df.dtypes)
    df = fill_in_missing_values(df)
    print("Columns with empty values: ", df.columns[df.isna().any()].tolist())
    return df

In [41]:
clean_valid_train = clean(valid_train.copy())

Data types:  body                       object
downs                       int64
created_utc                 int32
score                       int32
author                     object
distinguished              object
archived                     bool
subreddit                  object
author_flair_css_class     object
author_flair_text          object
gilded                      int32
ups                         int64
controversiality          float64
edited                       bool
dtype: object
Columns with empty values:  []


In [42]:
#return the wordnet object value corresponding to the POS tag
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk.corpus import wordnet

def get_wordnet_pos(pos_tag):
    if pos_tag.startswith('J'):
        return wordnet.ADJ
    elif pos_tag.startswith('V'):
        return wordnet.VERB
    elif pos_tag.startswith('N'):
        return wordnet.NOUN
    elif pos_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
    
import string
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer

def clean_text(text):
    # lower text
    text = str(text).lower()
    # tokenize text and remove puncutation
    text = [word.strip(string.punctuation) for word in text.split(" ")]
    # remove words that contain numbers
    text = [word for word in text if not any(c.isdigit() for c in word)]
    # remove stop words
    stop = stopwords.words('english')
    text = [x for x in text if x not in stop]
    # remove empty tokens
    text = [t for t in text if len(t) > 0]
    # pos tag text
    pos_tags = pos_tag(text)
    # lemmatize text
    text = [WordNetLemmatizer().lemmatize(t[0], get_wordnet_pos(t[1])) for t in pos_tags]
    # remove words with only one letter
    text = [t for t in text if len(t) > 1]
    # join all
    text = " ".join(text)
    return(text)

# clean text data
clean_valid_train["body_clean"] = clean_valid_train["body"].apply(lambda x: clean_text(x))
clean_valid_train=clean_valid_train.drop(columns=['body'])

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Sentiment analysis

In [45]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
#nltk.download('vader_lexicon')


def analyze_sentiments(df):
    sentiment_analyzer = SentimentIntensityAnalyzer()
    positive = []
    neutral = []
    negative = []
    compound = []
    for text in df['body_clean']:
        sentiment = sentiment_analyzer.polarity_scores(str(text))
        positive.append(sentiment['pos'])
        neutral.append(sentiment['neu'])
        negative.append(sentiment['neg'])
        compound.append(sentiment['compound'])
        if len(compound) % 500000 == 0:
            print(len(compound), "/", len(df['body_clean']))
    df['positive_sentiment'] = pd.Series(positive)
    df['neutral_sentiment'] = pd.Series(neutral)
    df['negative_sentiment'] = pd.Series(negative)
    df['compound_sentiment'] = pd.Series(compound)
    return df

In [46]:
engineered_train = analyze_sentiments(clean_valid_train)

In [47]:
#!pip install wordsegment
import wordsegment as ws
from wordsegment import segment
ws.load()
print(segment(ws.clean("workswithotherlanguages")))

['works', 'with', 'other', 'languages']


In [48]:
from wordsegment import load, segment
def segmentSubreddits(frame):
    subreddits=frame['subreddit']
    finalsb=[]
    ws.load()
    for i, sb in zip(range(len(subreddits)), subreddits):
        if i%25000 == 0:
            print(i/25000)
        if len(sb) != 0:
            segmented=segment(ws.clean(sb))
            stringsb=""
            for c in segmented:
                stringsb+=c+" "
            finalsb.append(stringsb[:-1])
        else:
            finalsb.append(sb)
    return finalsb

subreddits_fixed=pd.DataFrame(segmentSubreddits(engineered_train))
subreddits_fixed.head()

0.0
1.0
2.0
3.0
4.0
5.0
6.0
7.0


Unnamed: 0,0
0,ex mormon
1,canada politics
2,advice animals
3,free donuts
4,wtf


In [49]:
# add number of characters column
engineered_train["nb_chars"] = engineered_train["body_clean"].apply(lambda x: len(str(x)))

# add number of words column
engineered_train["nb_words"] = engineered_train["body_clean"].apply(lambda x: len(str(x).split(" ")))

In [50]:
engineered_train.head()

Unnamed: 0,downs,created_utc,score,author,distinguished,archived,subreddit,author_flair_css_class,author_flair_text,gilded,ups,controversiality,edited,body_clean,positive_sentiment,neutral_sentiment,negative_sentiment,compound_sentiment,nb_chars,nb_words
0,0,1420070400,14,YoungModern,,False,exmormon,,,0,14,0.0,True,family member like family like,0.625,0.375,0.0,0.6124,30,5
1,0,1420070400,3,RedCoatsForever,,False,CanadaPolitics,on,Ontario,0,3,0.0,True,mill's career way well bentham like joseph smi...,0.338,0.662,0.0,0.5574,69,11
2,0,1420070400,1,vhisic,,False,AdviceAnimals,,,0,1,0.0,True,mine use strait razor much love clipper love r...,0.363,0.563,0.074,0.8481,106,20
3,0,1420070400,2,Mastersimpson,,False,freedonuts,,,0,2,0.0,True,fast thank,0.714,0.286,0.0,0.3612,10,2
4,0,1420070400,6,BigGupp1,,False,WTF,,,0,6,0.0,True,guy professional good highly doubt miss often,0.251,0.346,0.404,-0.1953,45,7


In [51]:
engineered_train.shape

(178525, 20)

In [52]:
engineered_train.to_csv("engineered_train.csv", encoding='utf-8', index=False)