![PySpark](https://databricks.com/wp-content/uploads/2018/12/PySpark-1024x164.png)

<div style="color:white;
           display:fill;
           border-radius:5px;
           background-color:#ff8000;
           font-size:110%;
           font-family:Verdana;
           letter-spacing:0.5px">

<h1 style="text-align: center;
           padding: 10px;
              color:white">


PySpark
</h1>
</div>


In [None]:
# Install PySpark with the %pip magic so the package goes into the environment
# of the running kernel; `!pip` runs whichever pip the shell finds first.
%pip install pyspark

In [None]:
!python -m pip install findspark

<div style="color:white;
           display:fill;
           border-radius:5px;
           background-color:#ff8000;
           font-size:110%;
           font-family:Verdana;
           letter-spacing:0.5px">

<h1 style="text-align: center;
           padding: 10px;
              color:white">


Create Spark Context
</h1>
</div>


In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
# Reuse the active SparkContext if one exists; otherwise start one whose
# master is "local[*]".
sc = SparkContext.getOrCreate(
    conf=SparkConf().setMaster("local[*]"),
)

# SparkSession used by the DataFrame cells below (spark.read, ...).
spark = SparkSession.builder.getOrCreate()

In [None]:
# Read the BankChurners CSV: the first line supplies the column names
# (header) and column types are inferred from the data (inferSchema).
df = (
    spark.read
    .options(inferSchema=True, header=True)
    .csv('../input/credit-card-customers/BankChurners.csv')
)

# Print the first five rows as a quick sanity check of the load.
df.show(5)

<div style="color:white;
           display:fill;
           border-radius:5px;
           background-color:#ff8000;
           font-size:110%;
           font-family:Verdana;
           letter-spacing:0.5px">

<h1 style="text-align: center;
           padding: 10px;
              color:white">


Basic Spark Operations
</h1>
</div>


In [None]:
# List the column names of the loaded DataFrame.
df.columns

# 1. Schema

In [None]:
# Print each column's name and type as a tree (types were inferred when the CSV was read).
df.printSchema()

In [None]:
# Summary statistics restricted to the Attrition_Flag column; print up to 10 rows of the result.
df.describe('Attrition_Flag').show(10)

In [None]:
# Project just the Attrition_Flag and Customer_Age columns and print the first 10 rows.
df.select('Attrition_Flag','Customer_Age').show(10)

# How to find the mean revolving balance for each customer age?

In [None]:
# Mean Total_Revolving_Bal for every Customer_Age value.
# A grouped result has no guaranteed row order, so sort by age to get a
# stable table that can be read top to bottom.
# NOTE: show() prints only the first 20 rows unless a larger n is passed.
(
    df.groupby('Customer_Age')
    .agg({'Total_Revolving_Bal': 'mean'})
    .orderBy('Customer_Age')
    .show()
)

In [None]:
# Number of rows for every Customer_Age value.
# A grouped result has no guaranteed row order, so sort by age to keep the
# output stable between runs and easy to scan.
# NOTE: show() prints only the first 20 rows unless a larger n is passed.
(
    df.groupby('Customer_Age')
    .count()
    .orderBy('Customer_Age')
    .show()
)

In [None]:
from pyspark.mllib.stat import Statistics

# Columns whose pairwise Pearson correlation is computed below.
df_features = df.select(
    "Customer_Age",
    "Total_Trans_Amt",
    "Total_Trans_Ct",
    "Total_Revolving_Bal",
)

# Statistics.corr is fed an RDD, so turn every Row into a plain tuple of its values.
rdd_table = df_features.rdd.map(tuple)

# Pearson correlation matrix; rows and columns follow the select() order above.
corr_mat = Statistics.corr(rdd_table, method="pearson")
corr_mat

In [None]:
# Heatmap of the correlation matrix. Ticks are labelled with the feature names
# (same order as the columns selected into df_features) and a colorbar gives
# the value scale, so the figure can be read on its own.
fig, ax = plt.subplots()
heatmap = ax.imshow(corr_mat, cmap='GnBu')

feature_names = df_features.columns
tick_positions = range(len(feature_names))
ax.set_xticks(tick_positions)
ax.set_xticklabels(feature_names, rotation=45, ha='right')
ax.set_yticks(tick_positions)
ax.set_yticklabels(feature_names)
ax.set_title('Pearson correlation')

fig.colorbar(heatmap, ax=ax)
plt.show()  # also keeps the AxesImage repr out of the cell output

 <div style="color:white;
           display:fill;
           border-radius:5px;
           background-color:#ff8000;
           font-size:110%;
           font-family:Verdana;
           letter-spacing:0.5px">

<h1 style="text-align: center;
           padding: 10px;
              color:white">



String and Numeric Columns
</h1>
</div>


In [None]:
# Only the last expression of a cell is rendered automatically. The describe()
# table sits in the middle of the cell, so it must be passed to display()
# (injected by the IPython kernel) or it is computed and then silently dropped.
print('Data frame describe (string and numeric columns only):')
display(df.describe().toPandas())

# The two-row preview is the final expression and is rendered as the cell's output.
print(f'There are total {df.count()} row, Let print first 2 data rows:')
df.limit(2).toPandas()

In [None]:
# Row counts per distinct value of Months_on_book and of Total_Revolving_Bal;
# each name is bound to the grouped Spark DataFrame for its own column.
Months_on_book, Total_Revolving_Bal = (
    df.groupBy(column_name).count()
    for column_name in ('Months_on_book', 'Total_Revolving_Bal')
)

In [None]:
# Bring at most 20 rows of each grouped count to the driver as pandas frames.
# limit() runs before toPandas(), so only those rows are transferred instead of
# collecting the whole result as dicts and trimming it afterwards.
# The grouped frames are unordered, so which 20 rows are kept is arbitrary.
Months = Months_on_book.limit(20).toPandas()
Revolving_Bal = Total_Revolving_Bal.limit(20).toPandas()

In [None]:
# Only the last expression of a cell is rendered automatically, so the first
# preview must be passed to display() (injected by the IPython kernel);
# as a bare expression it would never be shown.
display(Months.head(10))
Revolving_Bal.head(10)