In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import findspark
findspark.init("/home/rajdeep/spark-3.5.0-bin-hadoop3")

In [3]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier, GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

In [17]:
#initalizing the spark object
spark = SparkSession.builder.appName("collegeClass").getOrCreate()

In [5]:
#reading the data into dataframe
df = spark.read.csv("data/College.csv", inferSchema=True, header=True)

In [6]:
df.columns

['School',
 'Private',
 'Apps',
 'Accept',
 'Enroll',
 'Top10perc',
 'Top25perc',
 'F_Undergrad',
 'P_Undergrad',
 'Outstate',
 'Room_Board',
 'Books',
 'Personal',
 'PhD',
 'Terminal',
 'S_F_Ratio',
 'perc_alumni',
 'Expend',
 'Grad_Rate']

In [7]:
#initializing the VectorAssembler to create feature vector for ML model
assembler = VectorAssembler(inputCols=['Apps','Accept','Enroll','Top10perc','Top25perc','F_Undergrad','P_Undergrad','Outstate',
                'Room_Board','Books','Personal','PhD','Terminal','S_F_Ratio','perc_alumni','Expend','Grad_Rate'], outputCol='feature')
#creating the new column with vector of required row data
df = assembler.transform(df)

In [8]:
df = df.select(['feature','Private'])

In [9]:
indexer = StringIndexer(inputCol='Private', outputCol='privateIndex')
df = indexer.fit(df).transform(df)

In [10]:
# splitting the available data into train - test df
train_df, test_df = df.randomSplit([0.7,0.3])

In [11]:
#initializing the DecisionTreeClassifier, RandomForestClassifier, GBTClassifier object
dt= DecisionTreeClassifier(featuresCol='feature', labelCol='privateIndex')
rf = RandomForestClassifier(featuresCol='feature', labelCol='privateIndex')
gbt = GBTClassifier(featuresCol='feature', labelCol='privateIndex',maxDepth=4)

In [12]:
#fitting the model's on train data
dtmodel = dt.fit(train_df)
rfmodel = rf.fit(train_df)
gbtmodel = gbt.fit(train_df)

                                                                                

In [13]:
# transforming the test data
dt_prediction = dtmodel.transform(test_df)
rf_prediction = rfmodel.transform(test_df)
gbt_prediction = gbtmodel.transform(test_df)

In [14]:
gbt_prediction.printSchema()

root
 |-- feature: vector (nullable = true)
 |-- Private: string (nullable = true)
 |-- privateIndex: double (nullable = false)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [15]:
#initializing the evaluator to understand the acc of the model
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',labelCol='privateIndex',metricName='accuracy')

In [16]:
print(f"A single decision tree had an accuracy of: {evaluator.evaluate(dt_prediction)}")
print(f"A random forest ensemble had an accuracy of: {evaluator.evaluate(rf_prediction)}")
print(f"A ensemble using GBT had an accuracy of: {evaluator.evaluate(gbt_prediction)}")

A single decision tree had an accuracy of: 0.9380165289256198
A random forest ensemble had an accuracy of: 0.9462809917355371
A ensemble using GBT had an accuracy of: 0.9297520661157025


23/11/27 10:17:05 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
