# Install PySpark

In [None]:
#Install Pyspark
!pip install pyspark

# Import Libraries

In [None]:
from datetime import datetime
import seaborn as sns
import pandas as pd
from matplotlib import pyplot as plt
import plotly.express as px
import pyspark.pandas as ps
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import SparkSession,SQLContext
from pyspark import SparkContext
from pyspark import SparkConf
import pyspark.pandas as ps
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml import Pipeline

# Initiate Spark

In [None]:
# Create the Spark session used by every cell below, or reuse one that already exists.
spark = (
    SparkSession.builder.appName("Ubiquant EDA")
    # Arrow setting for Spark <-> pandas data exchange.
    .config("spark.sql.execution.arrow.pyspark.enabled", True)
    # Memory-related limits for executors and for results collected to the driver.
    .config("spark.executor.memory", "5g")
    .config("spark.driver.maxResultSize", "40g")
    .getOrCreate()
)

# Import Dataset

In [None]:
# Load the training CSV (header row, inferred column types) and report the wall-clock time taken.
start_time = datetime.now()
df = spark.read.csv(
    "/kaggle/input/ubiquant-market-prediction/train.csv",
    header=True,
    inferSchema=True,
)
end_time = datetime.now()
print('Duration: {}'.format(end_time - start_time))

In [None]:
# Report the dataset dimensions: row count from Spark, column count from the schema.
print(
    "Number of rows=>", df.count(),
    "\nNumber of columns=>", len(df.columns),
)

# Null Check

Reading the ~3.1 million rows took less than 5 minutes in Spark.

In [None]:
#Check for number of missing values (NULL or NaN) in each column
# isnan() on its own never matches SQL NULLs and only applies to floating-point
# columns, so NULL is tested on every column and NaN only where it can occur.
float_cols = {name for name, dtype in df.dtypes if dtype in ("float", "double")}
df.select([
    count(
        when((df[c].isNull() | isnan(df[c])) if c in float_cols else df[c].isNull(), c)
    ).alias(c)
    for c in df.columns
]).show()

# Datatype of Columns

**Finding:** No nulls found in the dataset

In [None]:
#Data type of each column, as recorded in the DataFrame's schema
# (the types were inferred by Spark when the CSV was read with inferSchema=True)
df.dtypes

# Conversion of Spark DataFrame to View for Easy SQL

In [None]:
#Convert spark dataframe to View
# createOrReplaceTempView keeps this cell re-runnable: plain createTempView raises
# an error when a view named "train" already exists in the session.
df.createOrReplaceTempView("train")

# Minimum and Maximum Time_id

In [None]:
# Smallest and largest time_id present in the "train" view
(
    spark
    .sql("select min(time_id) as min_time_id,max(time_id) as max_time_id from train ")
    .show()
)

# Time Id Frequency

In [None]:
# Bar chart of the number of rows per time_id; the aggregation runs in Spark and
# only the grouped result is brought into pandas, where it is ordered by time_id.
fig = px.bar(
    spark.sql("select time_id,count(*) as count from train group by time_id ")
    .toPandas()
    .sort_values(by="time_id"),
    x="time_id",
    y="count",
    title='Number of records from each time_id',
)
fig.show()

**Finding:** No time_ids in the range 361,368 to 372,382

In [None]:
# Investment ids ranked by how many rows they appear in (show() prints the first 20 rows)
(
    spark
    .sql("""select investment_id,count(*) as count from train group by investment_id order by count desc """)
    .show()
)

# Time ID vs Avg Return

In [None]:
# Line chart of the mean target per time_id; the grouped result is converted to
# pandas and ordered by time_id before plotting.
fig = px.line(
    spark.sql("select time_id,avg(target) as avg_return from train group by time_id  ")
    .toPandas()
    .sort_values(by="time_id"),
    x="time_id",
    y="avg_return",
    title='Time_id vs Avg Return',
)
fig.show()

In [None]:
# Identifier columns that are left out of the correlation analysis
excl_cols = ['row_id', 'time_id', 'investment_id']
# Every remaining column of df, in its original order
cols = [name for name in df.columns if name not in excl_cols]

In [None]:
#Convert to pandas on spark
# DataFrame.to_pandas_on_spark() is deprecated in favour of pandas_api() and is not
# available in newer PySpark releases; prefer the new name and fall back to the
# old one only on installs that do not have it yet.
pdsprk_df = df.pandas_api() if hasattr(df, "pandas_api") else df.to_pandas_on_spark()

# Correlation Plot

In [None]:
# Pairwise correlation between the non-identifier columns selected in `cols`
corr = pdsprk_df[cols].corr()
# Render the matrix with a column-wise blue gradient as the cell's output
corr.style.background_gradient(axis=0, cmap='Blues')

# PySpark Machine Learning

In [None]:
# Modelling frame: the original data without its three identifier columns
traindf = df.drop('row_id', 'time_id', 'investment_id')

In [None]:
#Feature preprocessing
# Every column except the label is a model input. The comprehension variable is
# deliberately not named `col`: a top-level `for col in ...` loop would rebind the
# global `col` and shadow pyspark.sql.functions.col (brought in by the wildcard
# import) for every cell that runs afterwards.
feature_list = [name for name in traindf.columns if name != 'target']
# Pack the input columns into the single vector column expected by the regressor.
assembler = VectorAssembler(inputCols=feature_list, outputCol="features")

In [None]:
#Assign Random Forest Regressor
# A fixed seed makes the forest's random sampling, and therefore the feature
# importances reported further down, reproducible from one run to the next.
rf = RandomForestRegressor(labelCol="target", featuresCol="features", seed=42)

In [None]:
# Two-stage pipeline: assemble the feature vector, then fit the regressor on it
pipeline = Pipeline(
    stages=[
        assembler,
        rf,
    ]
)

In [None]:
#Train the model: fit the assembler + random-forest pipeline on traindf

rf_model=pipeline.fit(traindf)

In [None]:
#Feature Importance

va = rf_model.stages[-2]    # the VectorAssembler stage: holds the input column order
tree = rf_model.stages[-1]  # the fitted random-forest stage
# Pair each input column with its importance and rank from most to least important.
fea = sorted(
    zip(va.getInputCols(), tree.featureImportances),
    key=lambda pair: pair[1],
    reverse=True,
)
feature_importance = pd.DataFrame(fea, columns=['features', 'importance'])

#Store the columns having importance>0
# A boolean mask selects the rows directly instead of indexing the frame row by row.
imp_cols = feature_importance.loc[feature_importance['importance'] > 0, 'features'].tolist()

# Top 10 features based on Base Random Forest Regressor Model

In [None]:
# First 10 rows of the importance table, which was built from a list sorted by
# importance in descending order, so these are the 10 highest-ranked features.
feature_importance.head(10)

In [None]:
#Stop Spark Session
# Last cell of the notebook: nothing below needs the session any more.
spark.stop()