In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print the paths one per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install pyspark

In [None]:
import pyspark

In [None]:
# Importing SparkSession
from pyspark.sql import SparkSession

In [None]:
# Create the Spark session used by the rest of the notebook (application name: 'test') #

session_builder = SparkSession.builder.appName('test')
spark = session_builder.getOrCreate()

In [None]:
# Display the SparkSession object created in the previous cell.
spark
# NOTE(review): presumably this session runs in local mode on a single machine (not "1 cluster") - confirm from the session summary displayed by this cell.
# On a real cluster, Spark spreads the work across multiple worker nodes that process the data in parallel; this is what is called distributed computing.
# Spark session is started

In [None]:
# Open train.csv (file size: 5.45 GB) with Spark and time the call.
import time
t1 = time.perf_counter()  # monotonic clock intended for measuring elapsed time
df = spark.read.csv('../input/riiid-test-answer-prediction/train.csv')
t2 = time.perf_counter()
diff = t2 - t1
print(diff)

# The call returned in about 7 seconds.
# NOTE: Spark is lazy, so this measures creating the DataFrame, not loading the 5.45 GB of rows;
# the data is actually scanned when an action such as count() runs.
# Size of the data : 100 Mn Records

In [None]:
# Total number of rows in df (the read above, done without a header option).
df.count()

In [None]:
# Pandas is very efficient with small data (usually from 100MB up to 1GB).
# The original note reports that reading this whole file into a single pandas DataFrame ends in an out of memory error,
# so the file is opened with chunksize instead.
# With chunksize, read_csv returns an iterator that yields DataFrames of 1,000,000 rows, not a DataFrame;
# the rows are only read as it is iterated, so the time printed below does not measure a full read of the file.
import time
import pandas as pd
t1 = time.perf_counter()  # monotonic clock intended for measuring elapsed time
df1 = pd.read_csv('../input/riiid-test-answer-prediction/train.csv',chunksize=1000000)
t2 = time.perf_counter()
diff = t2 - t1
print(diff)

In [None]:
# First row of df (the read done without a header option).
df.head()
# Without a header option Spark assigns default column names : _c0, _c1, _c2, _c3, _c4..

In [None]:
# Set the header option to 'true' so the first line of the file supplies the column names.
reader_with_header = spark.read.option('header','true')
df1 = reader_with_header.csv('../input/riiid-test-answer-prediction/train.csv')

In [None]:
# Just like info provides the datatype of all columns in pandas,
# in pyspark we use printSchema to get the datatype of the columns.
# NOTE(review): this prints the schema of df (the read without header), not of the df1 created in the previous cell - confirm which one was intended.
df.printSchema()
# No schema inference was requested for this read, so every column is reported as string

In [None]:
# Re-read the file with header=True for the column names and inferSchema=True so that Spark infers a datatype for each column instead of treating everything as string.
csv_options = {'header': True, 'inferSchema': True}
df1 = spark.read.csv('../input/riiid-test-answer-prediction/train.csv', **csv_options)

In [None]:
# Print the schema of df1, which was read with inferSchema=True, to see the datatype of each column #
df1.printSchema()

In [None]:
# dtypes also helps to check the datatypes of all columns
df1.dtypes

In [None]:
# First 2 records of df1
df1.head(2)

In [None]:
# Column names of df1 #
df1.columns

In [None]:
# Show the first 6 values of a single column
user_id_column = df1.select('user_id')
user_id_column.show(6)

In [None]:
# Select several columns in one call and show the first 3 rows

wanted_columns = ['user_id','user_answer']
df1.select(wanted_columns).show(3)

In [None]:
# Build the summary table of the variables with describe, then print it
summary_stats = df1.describe()
summary_stats.show()

In [None]:
# Count how many distinct values the user_answer column holds #

from pyspark.sql.functions import countDistinct
distinct_answers = countDistinct('user_answer')
c1 = df1.select(distinct_answers)
c1.show()
# 5 unique categories

In [None]:
# Value counts: number of rows for each category of the column user_answer
answer_groups = df1.groupby('user_answer')
c2 = answer_groups.count()
c2.show()


In [None]:
# Add a new column user_answer_flag that holds the result of the comparison user_answer > 1, then show it next to user_answer
answer_above_one = df1['user_answer'] > 1
df1 = df1.withColumn('user_answer_flag', answer_above_one)
df1.select(['user_answer','user_answer_flag']).show(5)

In [None]:
# Remove the column user_answer_flag again and list the remaining column names #

columns_to_remove = ['user_answer_flag']
df1 = df1.drop(*columns_to_remove)
df1.columns

In [None]:
# Rename the column user_id to 'User Id' and list the column names #
old_name, new_name = 'user_id', 'User Id'
df1 = df1.withColumnRenamed(old_name, new_name)
df1.columns

In [None]:
# Shape of the dataframe: number of rows, then number of columns #

row_total = df1.count()
column_total = len(df1.columns)
print(row_total, column_total)

In [None]:
# Count the null values in every column of the dataframe
from pyspark.sql.functions import isnan, when, count, col
null_counts = [
    count(when(col(name).isNull(), name)).alias(name)
    for name in df1.columns
]
df1.select(null_counts).show()
# There are missing values : prior_question_elapsed_time and prior_question_had_explanation

In [None]:
# Parameters in dropping the null records #
# how ='any' (default), 'all'
# subset : we can drop the records if there are null values in a specific column
# thresh : e.g. with thresh=2, it keeps only those records which have at least 2 non-null values

In [None]:
# Count the rows that remain after dropping every row with a missing value in any column
df1.dropna().count()
# how defaults to 'any': a row is dropped as soon as a null value is found in any of its columns

In [None]:
df1.dropna(how='all').count()
# With how='all', a record is dropped only when all of its columns are null

In [None]:
# Total row count of df1, for comparison with the counts after dropping nulls
df1.count()

In [None]:
# Handling missing values #
# Fill the missing values of df (the read done without a header option) with 'Missing' and show the first 2 rows

df.fillna('Missing').show(2)

In [None]:
# First 2 rows of df1
df1.show(2)

In [None]:
# Filling missing values with mean value #

# Imputer calculates the mean of each input column; that mean is then used to replace the missing values

from pyspark.ml.feature import Imputer
columns_to_impute = ['prior_question_elapsed_time']
imputed_names = ["{}_imputed".format(name) for name in columns_to_impute]
imputer = Imputer(inputCols=columns_to_impute, outputCols=imputed_names)
imputer = imputer.setStrategy('mean')

In [None]:
# Fit the imputer on df1, apply it to df1 and show the first 2 rows
imputer_model = imputer.fit(df1)
imputer_model.transform(df1).show(2)

# The output has a new column : prior_question_elapsed_time_imputed, in which the missing values are replaced with the mean value.
# Further we can drop the prior_question_elapsed_time column which has missing values

In [None]:
# Filtering: fetch only the records of the dataframe that satisfy a condition #
# Here the condition is given as an expression string: user_answer>1
answers_above_one = df1.filter('user_answer>1')
answers_above_one.show(2)

In [None]:
# The same filter, written as a column expression instead of a string
above_one = df1['user_answer'] > 1
df1.filter(above_one).show(2)

In [None]:
# Filter first, then show only 2 columns of the filtered rows
filtered_rows = df1.filter('user_answer>1')
filtered_rows.select(['user_answer','answered_correctly']).show(2)

In [None]:
# Combining two conditions with 'and' (&) #
# user_answer > 1 and user_answer < 3, i.e. User Answer : 2
lower_bound = df1['user_answer'] > 1
upper_bound = df1['user_answer'] < 3
df1.filter(lower_bound & upper_bound).show(4)

In [None]:
# Combining two conditions with the Or (|) operator

is_one = df1['user_answer'] == 1
is_three = df1['user_answer'] == 3
df1.filter(is_one | is_three).show(4)

In [None]:
# Not operator (~) : negate the condition user_answer == 1
is_one = df1['user_answer'] == 1
df1.filter(~is_one).show(4)