In [13]:
from pyspark import SparkContext,SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions

In [2]:
conf=SparkConf().setAppName('Spark DataFrames')
sc=SparkContext(conf=conf)

In [3]:
sparksession=SparkSession.builder.appName('DataFrames').config('"spark.some.config.option","some-value"').getOrCreate()

In [4]:
data=sparksession.read.csv('Fake_data.csv',header=True,inferSchema=True)

In [5]:
data.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- Birth_Country: string (nullable = true)
 |-- Email: string (nullable = true)
 |-- First_Name: string (nullable = true)
 |-- Income: integer (nullable = true)
 |-- Job: string (nullable = true)
 |-- Last_name: string (nullable = true)
 |-- Loan_Approved: boolean (nullable = true)
 |-- SSN: string (nullable = true)



In [10]:
#1. Birth Country with highest people
print('\n Birth Country with highest people')
print(data.groupBy('Birth_Country').count().orderBy('count',ascending=False).show())


 Birth Country with highest people
+--------------------+-----+
|       Birth_Country|count|
+--------------------+-----+
|               Korea|   91|
|               Congo|   84|
|    Christmas Island|   57|
|             Ireland|   55|
|United Arab Emirates|   55|
|        Saint Helena|   53|
|               Egypt|   53|
|         Puerto Rico|   52|
|          Cape Verde|   52|
|             Myanmar|   51|
|            Thailand|   51|
|       Liechtenstein|   51|
|    French Polynesia|   51|
|              Serbia|   51|
|Heard Island and ...|   51|
|             Eritrea|   51|
|             Burundi|   50|
|                Togo|   50|
|              Gambia|   50|
|               Yemen|   49|
+--------------------+-----+
only showing top 20 rows

None


In [27]:
#2.Avg income of people who are born in USA
print('\n Average income of people who are born in usa')
print(data.filter(data['Birth_Country']=='United States of America').agg({"Income":"avg"}).show())


 Average income of people who are born in usa
+------------------+
|       avg(Income)|
+------------------+
|208759.82352941178|
+------------------+

None


In [32]:
#3.How many people have income more than 100000 whose loan is not approved
print('\n Number of people with income greater than 100000 but loan not approved')
print(data.filter((data['Income']>100000) &(data['Loan_Approved']==False)).count())


 Number of people with income greater than 100000 but loan not approved
4009


In [37]:
#4.Find top 10 people with highest income in USA
print('\n Top 10 people with highest income in USA')

print(data.select('First_Name','Last_Name','Income','Birth_Country').filter(data['Birth_Country']=='United States of America').orderBy(['Income'],ascending=False).limit(10).show())




 Top 10 people with highest income in USA
+----------+---------+------+--------------------+
|First_Name|Last_Name|Income|       Birth_Country|
+----------+---------+------+--------------------+
|    Alyssa|   Miller|482588|United States of ...|
|    Hunter|    Walls|468946|United States of ...|
|      Rose|Henderson|426115|United States of ...|
|  Danielle|  Leonard|389810|United States of ...|
|     Terry|    Klein|380410|United States of ...|
|     Cindy|   Newton|370322|United States of ...|
|     Scott| Mitchell|368913|United States of ...|
|   Christy| Sandoval|355150|United States of ...|
|     Kelly| Reynolds|341448|United States of ...|
|  Kristina|    Smith|338804|United States of ...|
+----------+---------+------+--------------------+

None


In [39]:
#5. Number of distinct jobs
print('Number of distinct jobs')
print(data.select('Job').distinct().count())

Number of distinct jobs
639


In [40]:
#6. How many writers earn less than 100000
print('Number of writers earn less than 100000')
print(data.select('SSN','Income').filter(data['Income']<100000).count())

Number of writers earn less than 100000
2033


In [41]:
sc.stop()