In [None]:
# Standard library first, then third-party packages.
import os

import pandas as pd
from pyspark.sql import SparkSession

# Obtain the Spark session for this notebook (an existing session is reused,
# otherwise one is created under this application name).
spark = (
    SparkSession.builder
    .appName('spark_test_notes')
    .getOrCreate()
)

# Show which files are present in the input directory.
print(os.listdir("../input"))

In [None]:
# Load the bike-share trip CSV into a Spark DataFrame: the first row supplies
# the column names and Spark is asked to infer each column's type.
df = spark.read.csv(
    "../input/metro-bike-share-trip-data.csv",
    header=True,
    inferSchema=True,
)

In [None]:
# Summary statistics for the columns of df, kept as a Spark DataFrame.
summary_df = df.describe()

In [None]:
# Convert the summary to pandas so the notebook renders it as a table.
summary_df.toPandas()

In [None]:
# Print the column names and the types Spark inferred while reading the CSV.
df.printSchema()

In [None]:
from pyspark.sql.functions import desc

# Rows per Trip ID, largest first; a top count above 1 would mean a Trip ID
# appears on more than one row.
(
    df.groupBy("Trip ID")
    .count()
    .orderBy(desc("count"))
    .show(5)
)

In [None]:
# Summary statistics for the Duration column only.
df.select("Duration").describe().show()

In [None]:
from pyspark.sql.functions import desc

# Five starting stations with the most trips.
(
    df.groupBy("Starting Station ID")
    .count()
    .orderBy(desc("count"))
    .show(5)
)

In [None]:
from pyspark.sql.functions import desc

# Five ending stations with the most trips.
(
    df.groupBy("Ending Station ID")
    .count()
    .orderBy(desc("count"))
    .show(5)
)

In [None]:
from pyspark.sql.functions import desc

# Five bikes with the most trips.
(
    df.groupBy("Bike ID")
    .count()
    .orderBy(desc("count"))
    .show(5)
)

# Trip counts per plan duration, most common first.
(
    df.groupBy("Plan Duration")
    .count()
    .orderBy(desc("count"))
    .show()
)

In [None]:
from pyspark.sql.functions import desc

# Trip counts per route category, most common first.
(
    df.groupBy("Trip Route Category")
    .count()
    .orderBy(desc("count"))
    .show()
)

# Trip counts per passholder type, most common first.
(
    df.groupBy("Passholder Type")
    .count()
    .orderBy(desc("count"))
    .show()
)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style("darkgrid")

# Aggregate in Spark, then bring the small per-type result into pandas for plotting.
# `desc` comes from the import performed in the earlier cells.
PT = (
    df.groupBy("Passholder Type")
    .count()
    .sort(desc("count"))
    .toPandas()
)

# Bar chart of trip counts per passholder type.
sns.barplot(x="Passholder Type", y="count", data=PT)
plt.title("Distribution by Passholder Type")

In [None]:
# Keep only the trip identifier and the two station columns for the
# station-pair analysis that follows.
sub_df = df.select(
    "Trip ID",
    "Starting Station ID",
    "Ending Station ID",
)

In [None]:
# Peek at the first five rows of the trimmed frame.
sub_df.head(5)

In [None]:
# Row count before any cleaning, for comparison with the counts in later cells.
sub_df.select("Trip ID").count()

In [None]:
# Discard rows that have a null in any of the selected columns, then report
# how many rows remain.
sub_df = sub_df.na.drop()
sub_df.count()

In [None]:
# Trip counts per (start station, end station) pair, most frequent first.
(
    sub_df.groupBy("Starting Station ID", "Ending Station ID")
    .count()
    .sort(desc("count"))
    .show()
)

In [None]:
# Keep only trips that end at a different station from where they started.
sub_df = sub_df.where(
    sub_df["Starting Station ID"] != sub_df["Ending Station ID"]
)

In [None]:
# Keep a single row for each distinct (start, end) station pair and report
# how many distinct pairs there are.
sub_df = sub_df.drop_duplicates(
    ["Starting Station ID", "Ending Station ID"]
)
sub_df.count()

In [None]:
# Sanity check: count rows whose start and end station are equal. The earlier
# filter cell removed such rows, so this is expected to be 0.
sub_df.filter(sub_df["Starting Station ID"]==sub_df["Ending Station ID"]).count()

In [None]:
# Show the pairs whose starting station ID falls between 3000 and 3005.
sub_df.filter(sub_df["Starting Station ID"].between(3000,3005)).show()

In [None]:
# Rename the trip identifier column to the shorter "id" and preview the result.
sub_df = sub_df.withColumnRenamed(
    "Trip ID",
    "id",
)
sub_df.head(5)

In [None]:
# Import only the function this cell needs. A wildcard import of
# pyspark.sql.functions would rebind Python builtins such as sum, min, max,
# abs and round to Spark column functions for every later cell.
from pyspark.sql.functions import array

# Pack each row's start and end station into a single array column "items",
# the column the FPGrowth cell later reads via itemsCol="items".
sub_df = sub_df.withColumn(
    "items",
    array("Starting Station ID", "Ending Station ID"),
)

In [None]:
# Preview the rows and confirm the schema now includes the "items" array column.
sub_df.show(5)
sub_df.printSchema()

In [None]:
# Reduce the frame to the two columns used from here on: the row id and its
# station array.
sub_df = sub_df.select("id", "items")

# Inspect the result: sample rows, schema, and row count.
sub_df.show(5)
sub_df.printSchema()
sub_df.count()

In [None]:
# Keep one row per distinct "items" array and report the row count.
# NOTE(review): the (start, end) pairs were already de-duplicated before
# "items" was built, so this likely removes nothing -- compare the two counts.
sub_df = sub_df.drop_duplicates(["items"])
sub_df.count()

In [None]:
from pyspark.ml.fpm import FPGrowth

In [None]:
# Configure FP-Growth over the "items" arrays with very low support and
# confidence thresholds, then fit it on the station-pair frame.
fpGrowth = FPGrowth(
    itemsCol="items",
    minSupport=0.0001,
    minConfidence=0.0001,
)
model = fpGrowth.fit(sub_df)

In [None]:
# Frequent itemsets found by the model, highest frequency first.
model.freqItemsets.sort(desc("freq")).show()

In [None]:
# Top 25 association rules, ordered by descending confidence.
model.associationRules.sort(desc("confidence")).show(25)

In [None]:
from sklearn.datasets import load_iris

In [None]:
# Load scikit-learn's bundled iris dataset; later cells read its "data" and
# "target" entries.
idf = load_iris()

In [None]:
# Wrap the measurement matrix and the target vector in separate pandas frames.
features = pd.DataFrame(data=idf["data"])
labels = pd.DataFrame(data=idf["target"])

In [None]:
# Name the four measurement columns and the single target column.
# NOTE: "Sepal_Lenght" is misspelled, but the VectorAssembler cell refers to
# the column by this exact spelling, so the two must be renamed together.
features.columns = [
    "Sepal_Lenght",
    "Sepal_Width",
    "Petal_Length",
    "Petal_Width",
]
labels.columns = ["label"]

In [None]:
# Place the label column next to the feature columns (column-wise concat),
# then look at the last few rows.
iris_df = pd.concat([features, labels], axis="columns")
iris_df.tail()

In [None]:
# Convert the pandas frame into a Spark DataFrame for use with pyspark.ml.
iris = spark.createDataFrame(iris_df)

In [None]:
# Feature column names: every column of iris except 'label'.
# (Not referenced again by the cells visible in this notebook.)
names = iris.drop('label').columns

In [None]:
# Display all column names of the Spark frame, label included.
iris.columns

In [None]:
# Preview the first ten rows.
iris.show(10)

In [None]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors

In [None]:
# Combine the four measurement columns into one vector column, "features".
# The "Sepal_Lenght" spelling matches the column name assigned earlier.
assembler = VectorAssembler(
    inputCols=[
        "Sepal_Lenght",
        "Sepal_Width",
        "Petal_Length",
        "Petal_Width",
    ],
    outputCol="features",
)

In [None]:
# Apply the assembler, which adds the "features" vector column to iris.
iris = assembler.transform(iris)

In [None]:
# Keep only the two columns the classifier reads, then preview them.
iris = iris.select(
    "label",
    "features",
)
iris.show(5)

In [None]:
# 70/30 random split into training and testing sets, seeded with 1234.
training, testing = iris.randomSplit(weights=[0.7, 0.3], seed=1234)

# Only the last bare expression of a cell is echoed, so `testing` is the one
# the notebook displays here.
training
testing

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
# Random-forest classifier with 25 trees and a fixed seed. The variable is
# called `dtc` even though it is a random forest; later cells refer to it by
# this name.
dtc = RandomForestClassifier(
    labelCol="label",
    featuresCol="features",
    numTrees=25,
    seed=1234,
)

In [None]:
# Single-stage ML pipeline containing only the classifier.
pipeline = Pipeline(stages=[dtc])

In [None]:
# Fit the pipeline on the training split.
dtc_model = pipeline.fit(training)

In [None]:
# Apply the fitted pipeline to the held-out testing split.
dtc_predictions = dtc_model.transform(testing)

In [None]:
# Preview the first five rows of the model output on the testing split.
dtc_predictions.show(5)

In [None]:
# Evaluator that compares the "prediction" column against "label" and
# reports accuracy.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)

In [None]:
# Score the test-set predictions with the accuracy evaluator.
accuracy = evaluator.evaluate(dtc_predictions)

# Report accuracy and its complement, the test error.
print("Accuracy: ", accuracy)
print("Test Error = %g" % (1.0 - accuracy))

In [None]:
# Confusion matrix: one row per true label, one column per predicted class.
# pivot().count() leaves null wherever a (label, prediction) pair never
# occurs, so those cells are filled with 0, and rows are ordered by label so
# the table reads the same on every run.
(
    dtc_predictions
    .groupBy("label")
    .pivot("prediction")
    .count()
    .na.fill(0)
    .orderBy("label")
    .show()
)