In [1]:
!pip install -q findspark
import findspark
findspark.init()

from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()
from pyspark.sql.types import StructType,StructField, StringType, IntegerType, DoubleType

# 01_Reading_sample_Data

In [4]:
def get_schema(d, nullable=True):
    """Build a Spark ``StructType`` from a mapping of column name to data type.

    Args:
        d: Mapping whose keys are column names and whose values are the
            data types passed to ``StructType.add``. Fields are added in
            the mapping's iteration order.
        nullable: Nullability flag applied to every field. Defaults to
            ``True``, which matches the previous hard-coded behaviour.

    Returns:
        A ``StructType`` containing one field per entry of ``d`` (empty when
        ``d`` is empty).
    """
    new_schema = StructType()
    for column_name, data_type in d.items():
        new_schema.add(column_name, data_type, nullable)
    return new_schema

# Column name -> Spark data type, listed in the order the schema is built.
d = {
    'year': IntegerType(),
    'imdb': StringType(),
    'title': StringType(),
    'test': StringType(),
    'clean_test': StringType(),
    'binary': StringType(),
    'budget': DoubleType(),
    'domgross': StringType(),
    'intgross': StringType(),
    'code': StringType(),
    'budget_2013$': DoubleType(),
    'domgross_2013$': StringType(),
    'intgross_2013$': StringType(),
    'period code': IntegerType(),
    'decade code': IntegerType(),
}

# Read the CSV with the explicit schema instead of letting Spark infer types.
df = spark.read.csv("movies.csv", header=True, schema=get_schema(d))

# Normalise column names: spaces and '-' become '_', while ':' and '$' are dropped.
clean_names = [
    name.replace(' ', '_').replace(':', '').replace('-', '_').replace('$', '')
    for name in df.columns
]
df = df.toDF(*clean_names)

# 02_Questions

1. Basic syntax for creating a sample DataFrame.
2. How to add a new column.
3. How to rename a column.
4. How to find the max, min, and average of a column in a DataFrame.

In [34]:
# 1. Creating a sample DataFrame: imports and a Spark session.
from pyspark.sql import SparkSession
from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    IntegerType,
    DoubleType,
)

# getOrCreate() hands back the already-running session when one exists.
session_builder = SparkSession.builder.master("local[*]")
spark = session_builder.getOrCreate()

In [24]:
# 1. Creating a sample DataFrame from a list of tuples plus column names.
# Each tuple is one row; values line up positionally with `columns`.
data2 = [
    ("James", "", "Smith", "36636", "M", 3000),
    ("Michael", "Rose", "", "40288", "M", 4000),
    ("Robert", "", "Williams", "42114", "M", 4000),
    ("Maria", "Anne", "Jones", "39192", "F", 4000),
    ("Jen", "Mary", "Brown", "", "F", -1),
]
columns = [
    "firstname",
    "middlename",
    "lastname2",
    "id",
    "gender",
    "salary",
]

# Only names are supplied here, so Spark infers each column's type from the data.
df = spark.createDataFrame(data=data2, schema=columns)
df.printSchema()

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname2: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)



In [25]:
# Alternative: declare the schema explicitly with a StructType.
# (name, data type) for each column; every field is created as nullable.
field_specs = [
    ("firstname", StringType()),
    ("middlename", StringType()),
    ("lastname", StringType()),
    ("id", StringType()),
    ("gender", StringType()),
    ("salary", IntegerType()),
]
schema = StructType([StructField(name, dtype, True) for name, dtype in field_specs])

df = spark.createDataFrame(data2, schema)
df.printSchema()

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)



In [36]:
# Print the current schema, then preview `salary` cast to double.
# The cast result is only displayed; `df` itself is not reassigned.
df.printSchema()
salary_as_double = df["salary"].cast(DoubleType())
df.withColumn('salary', salary_as_double).show(2)

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)

+---------+----------+--------+-----+------+------+
|firstname|middlename|lastname|   id|gender|salary|
+---------+----------+--------+-----+------+------+
|    James|          |   Smith|36636|     M|3000.0|
|  Michael|      Rose|        |40288|     M|4000.0|
+---------+----------+--------+-----+------+------+
only showing top 2 rows



In [8]:
#2. How to add a new column.
# The sample-DataFrame cells above rebind `df` to a frame with no `year` column,
# so on a top-to-bottom run `df.year` would fail. Reload the movies data first
# (same read and column-name clean-up as the loading cell).
df = spark.read.csv("movies.csv", header=True, schema=get_schema(d))
df = df.toDF(*[name.replace(' ','_').replace(':','').replace('-','_').replace('$','') for name in df.columns])

# withColumn returns a new DataFrame; `df` is left without `year2`.
df.withColumn("year2", df.year - 1000).show(3)

+----+---------+----------------+---------------+----------+------+------+--------+---------+--------+-----------+-------------+-------------+-----------+-----------+-----+
|year|     imdb|           title|           test|clean_test|binary|budget|domgross| intgross|    code|budget_2013|domgross_2013|intgross_2013|period_code|decade_code|year2|
+----+---------+----------------+---------------+----------+------+------+--------+---------+--------+-----------+-------------+-------------+-----------+-----------+-----+
|2013|tt1711425|   21 &amp; Over|         notalk|    notalk|  FAIL| 1.3E7|25682380| 42195766|2013FAIL|      1.3E7|     25682380|     42195766|          1|          1| 1013|
|2012|tt1343727|        Dredd 3D|    ok-disagree|        ok|  PASS| 4.5E7|13414714| 40868994|2012PASS|4.5658735E7|     13611086|     41467257|          1|          1| 1012|
|2013|tt2024544|12 Years a Slave|notalk-disagree|    notalk|  FAIL| 2.0E7|53107035|158607035|2013FAIL|      2.0E7|     53107035|    158

In [11]:
# 3. Renaming a column.
# withColumnRenamed returns a new DataFrame; only its column names are printed here.
# Expects `df` to have a `year` column.
renamed_df = df.withColumnRenamed("year", "this_year")
print(renamed_df.columns)

['this_year', 'imdb', 'title', 'test', 'clean_test', 'binary', 'budget', 'domgross', 'intgross', 'code', 'budget_2013', 'domgross_2013', 'intgross_2013', 'period_code', 'decade_code']


In [19]:
#4. How to find the max, min, avg of a column in that DF
# Compute all three aggregates in a single pass over the data instead of
# launching one Spark job per statistic. The result is a single Row, which
# unpacks like a tuple in the order the expressions were given.
year_max, year_min, year_avg = df.selectExpr("max(year)", "min(year)", "avg(year)").first()
print(year_max)
print(year_min)
print(year_avg)

2013
1970
2002.552396878484
