In [1]:
%sh
# Upgrade pip itself before the package installs that follow.
pip install --upgrade pip

In [2]:
%sh
# plotly is the only third-party package this notebook has to install.
# `datetime` is part of the Python standard library and must not be
# pip-installed: `pip install datetime` resolves to the unrelated third-party
# "DateTime" distribution on PyPI, which this notebook never uses.
pip install plotly

In [3]:
from pyspark.sql.functions import col, udf, monotonically_increasing_id
from pyspark.sql.types import IntegerType, TimestampType, StringType, BooleanType, StructType, StructField
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml import Pipeline
import plotly.graph_objects as go
import plotly
import datetime

In [4]:
# Where the training data lives and which reader format to use
file_location = "/FileStore/tables/train_project-4.csv"
file_type = "csv"

# Reader settings for CSV input
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","

# These options only matter for CSV files; other file types ignore them.
df = (
    spark.read
    .format(file_type)
    .options(inferSchema=infer_schema, header=first_row_is_header, sep=delimiter)
    .load(file_location)
)

# Preview the first 5 rows
display(df.limit(5))

archived,author,author_cakeday,author_flair_background_color,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_text_color,author_flair_type,brand_safe,can_gild,contest_mode,created_utc,distinguished,domain,edited,gilded,hidden,hide_score,id,is_crosspostable,is_reddit_media_domain,is_self,is_video,link_flair_css_class,link_flair_richtext,link_flair_text,link_flair_text_color,link_flair_type,locked,media,no_follow,num_comments,num_crossposts,over_18,parent_whitelist_status,permalink,post_hint,preview,retrieved_on,rte_mode,score,secure_media,selftext,send_replies,spoiler,stickied,subreddit,subreddit_id,subreddit_name_prefixed,subreddit_type,suggested_sort,thumbnail,thumbnail_height,thumbnail_width,title,url,whitelist_status
True,codepoet,,,,[],,,text,True,True,False,1141171234,,macgeekery.com,False,0,False,False,2icw,True,False,False,False,,[],,dark,text,False,,True,0,0,False,all_ads,/r/reddit.com/comments/2icw/well_that_was_a_bust/,,,,markdown,0,,,True,False,False,reddit.com,t5_6,r/reddit.com,archived,,default,,,Well That Was a Bust,http://www.macgeekery.com/opinion/well_that_was_a_bust,all_ads
True,scylla,,,,[],,,text,True,True,False,1141171723,,msnbc.msn.com,False,0,False,False,2idn,True,False,False,False,,[],,dark,text,False,,True,0,0,False,all_ads,/r/reddit.com/comments/2idn/holocaust_why_david_irving_shouldnt_be_jailed_and/,,,,markdown,0,,,True,False,False,reddit.com,t5_6,r/reddit.com,archived,,default,,,Holocaust: Why David Irving shouldn’t be jailed ( and it's not because he's in any ways right ),http://www.msnbc.msn.com/id/11569497/site/newsweek/,all_ads
True,tilto,,,,[],,,text,True,True,False,1141171939,,iht.com,False,0,False,False,2ie4,True,False,False,False,,[],,dark,text,False,,True,0,0,False,all_ads,/r/reddit.com/comments/2ie4/google_shares_fall_sharply_as_cfo_announces/,,,,markdown,0,,,True,False,False,reddit.com,t5_6,r/reddit.com,archived,,default,,,Google shares fall sharply as CFO announces growth is slowing,http://www.iht.com/articles/2006/02/28/business/google.php,all_ads
True,Laibcoms,,,,[],,,text,True,True,False,1141172196,,gameshogun.info,False,0,False,False,2iek,True,False,False,False,,[],,dark,text,False,,True,0,0,False,all_ads,/r/reddit.com/comments/2iek/newsvine_launching_tomorrow/,,,,markdown,0,,,True,False,False,reddit.com,t5_6,r/reddit.com,archived,,default,,,newsvine launching tomorrow!,http://gameshogun.info/index.php/Tech/2006/03/01/newsvine_launching_tomorrow,all_ads
True,FaeLLe,,,,[],,,text,True,True,False,1141172277,,faelle.com,False,0,False,False,2ies,True,False,False,False,,[],,dark,text,False,,True,0,0,False,all_ads,/r/reddit.com/comments/2ies/voodoopc_to_launch_8tb_media_pc/,,,,markdown,0,,,True,False,False,reddit.com,t5_6,r/reddit.com,archived,,default,,,VoodooPC to launch 8TB Media PC,http://www.FaeLLe.com/2006/03/voodoopc-plans-8tb-media-pc.html,all_ads


In [5]:
# Report the size of the training dataframe: row count first, then column count.
n_samples = df.count()
n_columns = len(df.columns)
print("Number of sample is in train dataset:", n_samples,
      "\nNumber of columns/variables:", n_columns)

Our target variable is the `score` column of the dataframe. Let's look at its summary statistics:

In [7]:
# Summary statistics (count, mean, stddev, min, max) of the target column,
# converted to pandas so the result renders as a table.
df.describe('score').toPandas()

Unnamed: 0,summary,score
0,count,12524
1,mean,7.5152967489416085
2,stddev,29.11053033813615
3,min,"""""url"""":""""https://i.redditmedia.com/2GE_8-hy9X..."
4,max,99


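There are a few null values and some URL fragments in the `score` column; these rows need to be removed from the dataset. Note that `score` was read as a string column, so the `min` and `max` shown above are lexicographic rather than numeric.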

In [9]:
def data_cleaning(df, bool_cols=None):
    """Drop rows with an unusable ``score`` and cast boolean columns to string.

    Rows are removed when their ``score`` contains the URL marker substring
    or is null. The number of remaining rows is printed, and then every
    column listed in ``bool_cols`` is cast to string.

    Args:
        df: DataFrame with a ``score`` column and the columns named in
            ``bool_cols``.
        bool_cols: Names of the columns to cast to string. When omitted, the
            five boolean feature columns used by this notebook are cast
            (``brand_safe``, ``can_gild``, ``is_crosspostable``,
            ``no_follow``, ``over_18``).

    Returns:
        The filtered DataFrame with the selected columns cast to string.
    """
    if bool_cols is None:
        bool_cols = ['brand_safe', 'can_gild', 'is_crosspostable', 'no_follow', 'over_18']

    # Malformed rows carry a URL fragment in the score column; drop them,
    # then drop rows that have no score at all.
    df = df.filter(~df.score.contains('""url"":""https:'))
    df = df.filter(df.score.isNotNull())
    print("Number of Sample after dropping Null Score values", df.count())

    # Cast through the dataframe's own column reference so the function does
    # not depend on a globally imported `col` helper.
    for col_name in bool_cols:
        df = df.withColumn(col_name, df[col_name].cast("string"))

    return df

In [10]:
# Remove rows whose score holds the "url..https" fragment, then rows with no score.
df = (
    df
    .filter(~col('score').contains('""url"":""https:'))
    .filter(col('score').isNotNull())
)
print("Number of Sample after dropping Null Score values", df.count())

In [11]:
# Keep the target in its own single-column dataframe, with `score` cast to
# integer. Previously `target` was overwritten with the full dataframe, so it
# carried every column instead of just the target.
# `df` itself is left unchanged (the score column is not dropped from it).
target = df.select(col("score").cast("integer").alias("score"))


In [12]:
# Bring the score values into pandas; this Series is used for visualizations only.
# Select the column before toPandas() so that only `score` is collected,
# rather than every column held by `target`.
score = target.select('score').toPandas()['score']

In [13]:
# Histogram trace of the target values (nbinsx = 20)
data = go.Histogram(x=score, nbinsx=20)

# Centered title, labelled axes and a fixed figure width
layout = go.Layout(
    title=dict(
        text='Distribution of scores across the dataset',
        x=0.5,
        xanchor='center',
        yanchor='top',
    ),
    xaxis={'title': 'Score'},
    yaxis={'title': 'Count'},
    width=500,
)

# Last expression of the cell: the assembled figure
go.Figure(data=data, layout=layout)

From the histogram above it is evident that very few posts have a score greater than 100:

- 100 to 200 => 190 posts
- 200 to 300 => 46 posts
- 300 to 400 => 16 posts
- 400 to 500 => 1 post
- 500 to 600 => 3 posts

Therefore, when splitting the dataset we must make sure that every score range is represented in both the training and the validation set.

In [15]:
# Print the schema of `df` to inspect its columns and their data types.
df.printSchema()

Each column is of string, boolean, or integer type.
We separate the columns by type to make the dataset easier to understand.

In [17]:
# Names of the string-typed columns
str_col = [
    field.name
    for field in df.schema.fields
    if isinstance(field.dataType, StringType)
]

# Names of the boolean-typed columns
bol_col = [
    field.name
    for field in df.schema.fields
    if isinstance(field.dataType, BooleanType)
]

# Names of the integer-typed columns
int_col = [
    field.name
    for field in df.schema.fields
    if isinstance(field.dataType, IntegerType)
]

# Sanity check: together the three lists should account for every column.
n_classified = len(str_col) + len(bol_col) + len(int_col)
print("Check: No column is left out of the 3 lists: ", n_classified == len(df.columns))

In [18]:
def distinct_values(col_list, df):
  """Collect the distinct values of each listed column.

  Args:
    col_list: Names of the columns to inspect.
    df: DataFrame to query; one select/distinct/collect is issued per column.

  Returns:
    A dict mapping each column name to the list returned by ``collect()``
    for that column's distinct values.
  """
  return {
    name: df.select(name).distinct().collect()
    for name in col_list
  }

In [19]:
# A boolean column is only usable as a feature if it takes both True and
# False; list the distinct values of every boolean column to check.
bol_dist_val = distinct_values(col_list=bol_col, df=df)
bol_dist_val

We keep the boolean columns `brand_safe`, `can_gild`, `is_crosspostable`, `no_follow` and `over_18` as features for prediction.

In [21]:
# Boolean columns retained as features
bol_col = [
    'brand_safe',
    'can_gild',
    'is_crosspostable',
    'no_follow',
    'over_18',
]

# Cast each retained boolean column to string, one column at a time.
for col_name in bol_col:
    df = df.withColumn(col_name, df[col_name].cast("string"))

In [22]:
# Display the list of string-typed column names collected earlier.
str_col

In [23]:
# Narrow the string columns down to the single one that is kept as a feature.
str_col = ['subreddit_type']

In [24]:
# Distinct values of the retained string column(s)
str_dist_val = distinct_values(col_list=str_col, df=df)
str_dist_val

In [25]:
# Categorical features: the boolean columns followed by the string column(s)
cat_col = [*bol_col, *str_col]

# Numeric feature
int_col = ['num_comments']

In [27]:
%run /Users/shimona.narang@mail.utoronto.ca/feature_engineering

In [28]:
# Keep a handle on the prepared dataframe and build the modelling dataset from it.
# NOTE(review): `features` is not defined in this notebook; presumably it comes
# from the feature_engineering notebook run above. Confirm.
input_df = df
data_to_model = features(input_df, cat_col, int_col)

In [29]:

# Name the columns of the modelling dataframe 'features' and 'label'.
trainingData = data_to_model.toDF('features', 'label')

In [30]:
%run /Users/shimona.narang@mail.utoronto.ca/test_data


In [31]:
# Name the columns of the test dataframe 'features' and 'label'.
# NOTE(review): `testData` is presumably created by the test_data notebook run above. Confirm.
testData = testData.toDF('features', 'label')