## Topics


- PySpark Dataframe
- Reading The Dataset
- Checking the Datatypes of the Column(Schema)
- Selecting Columns And Indexing
- Check Describe option similar to Pandas
- Adding Columns
- Dropping columns
- Renaming Columns

In [1]:
from pyspark.sql import SparkSession

In [2]:
spark=SparkSession.builder.appName('dataframe').getOrCreate()

In [3]:
spark

In [32]:
#read the dataset
dataframe_spark=spark.read.option('header','true').csv('Updated_company_file',inferSchema=True)

In [33]:
dataframe_spark.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- Company Name: string (nullable = true)
 |-- Job Title: string (nullable = true)
 |-- Salaries Reported: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- Salary: string (nullable = true)



In [34]:
#getting column names
dataframe_spark.columns

['_c0', 'Company Name', 'Job Title', 'Salaries Reported', 'Location', 'Salary']

In [35]:
dataframe_spark.head(3)

[Row(_c0=0, Company Name='Mu Sigma', Job Title='Data Scientist', Salaries Reported=105.0, Location='Bangalore', Salary='648573'),
 Row(_c0=1, Company Name='IBM', Job Title='Data Scientist', Salaries Reported=95.0, Location='Bangalore', Salary='1191950'),
 Row(_c0=2, Company Name='Tata Consultancy Services', Job Title='Data Scientist', Salaries Reported=66.0, Location='Bangalore', Salary='836874')]

In [36]:
dataframe_spark.select(['Salaries Reported','Salary']).show()

+-----------------+-------+
|Salaries Reported| Salary|
+-----------------+-------+
|            105.0| 648573|
|             95.0|1191950|
|             66.0| 836874|
|             40.0| 669578|
|             32.0| 944110|
|             30.0| 908764|
|             28.0| 926124|
|             26.0| 736708|
|             25.0|1646721|
|             22.0|1392960|
|             20.0|1404773|
|             19.0|1507343|
|             19.0|1558095|
|             18.0|2557843|
|             16.0|1510081|
|             15.0| 732821|
|             15.0|1221750|
|             15.0|1739484|
|             14.0|1124939|
|             14.0|1630396|
+-----------------+-------+
only showing top 20 rows



In [37]:
dataframe_spark.dtypes

[('_c0', 'int'),
 ('Company Name', 'string'),
 ('Job Title', 'string'),
 ('Salaries Reported', 'double'),
 ('Location', 'string'),
 ('Salary', 'string')]

In [38]:
dataframe_spark.describe().show()

+-------+------------------+---------------+--------------------+------------------+---------+-----------------+
|summary|               _c0|   Company Name|           Job Title| Salaries Reported| Location|           Salary|
+-------+------------------+---------------+--------------------+------------------+---------+-----------------+
|  count|              4344|           4341|                4344|              4342|     4344|             4344|
|   mean|            2171.5|           null|                null|2.7759097190234914|     null|768129.6433341008|
| stddev|1254.1491139414004|           null|                null|5.1465274725498755|     null|769308.5390143887|
|    min|                 0|& Other Stories|Associate Machine...|               1.0|Bangalore|              100|
|    max|              4343|√Åguas Guariroba|Software Engineer...|             105.0|     Pune|        FN 156179|
+-------+------------------+---------------+--------------------+------------------+---------+-

In [39]:
#Adding column in dataframe
dataframe_spark.withColumn('new_location',dataframe_spark['Location']+'new').show()

+---+--------------------+--------------+-----------------+---------+-------+------------+
|_c0|        Company Name|     Job Title|Salaries Reported| Location| Salary|new_location|
+---+--------------------+--------------+-----------------+---------+-------+------------+
|  0|            Mu Sigma|Data Scientist|            105.0|Bangalore| 648573|        null|
|  1|                 IBM|Data Scientist|             95.0|Bangalore|1191950|        null|
|  2|Tata Consultancy ...|Data Scientist|             66.0|Bangalore| 836874|        null|
|  3|    Impact Analytics|Data Scientist|             40.0|Bangalore| 669578|        null|
|  4|           Accenture|Data Scientist|             32.0|Bangalore| 944110|        null|
|  5|             Infosys|Data Scientist|             30.0|Bangalore| 908764|        null|
|  6|           Capgemini|Data Scientist|             28.0|Bangalore| 926124|        null|
|  7|Cognizant Technol...|Data Scientist|             26.0|Bangalore| 736708|        null|

In [40]:
dataframe_spark=dataframe_spark.drop('_c0')

In [41]:
dataframe_spark.show()

+--------------------+--------------+-----------------+---------+-------+
|        Company Name|     Job Title|Salaries Reported| Location| Salary|
+--------------------+--------------+-----------------+---------+-------+
|            Mu Sigma|Data Scientist|            105.0|Bangalore| 648573|
|                 IBM|Data Scientist|             95.0|Bangalore|1191950|
|Tata Consultancy ...|Data Scientist|             66.0|Bangalore| 836874|
|    Impact Analytics|Data Scientist|             40.0|Bangalore| 669578|
|           Accenture|Data Scientist|             32.0|Bangalore| 944110|
|             Infosys|Data Scientist|             30.0|Bangalore| 908764|
|           Capgemini|Data Scientist|             28.0|Bangalore| 926124|
|Cognizant Technol...|Data Scientist|             26.0|Bangalore| 736708|
|Anheuser-Busch InBev|Data Scientist|             25.0|Bangalore|1646721|
|             Fractal|Data Scientist|             22.0|Bangalore|1392960|
|              Embibe|Data Scientist| 

In [42]:
#rename the column
dataframe_spark=dataframe_spark.withColumnRenamed('Salaries Reported','No_of_salary')

In [43]:
dataframe_spark.show()

+--------------------+--------------+------------+---------+-------+
|        Company Name|     Job Title|No_of_salary| Location| Salary|
+--------------------+--------------+------------+---------+-------+
|            Mu Sigma|Data Scientist|       105.0|Bangalore| 648573|
|                 IBM|Data Scientist|        95.0|Bangalore|1191950|
|Tata Consultancy ...|Data Scientist|        66.0|Bangalore| 836874|
|    Impact Analytics|Data Scientist|        40.0|Bangalore| 669578|
|           Accenture|Data Scientist|        32.0|Bangalore| 944110|
|             Infosys|Data Scientist|        30.0|Bangalore| 908764|
|           Capgemini|Data Scientist|        28.0|Bangalore| 926124|
|Cognizant Technol...|Data Scientist|        26.0|Bangalore| 736708|
|Anheuser-Busch InBev|Data Scientist|        25.0|Bangalore|1646721|
|             Fractal|Data Scientist|        22.0|Bangalore|1392960|
|              Embibe|Data Scientist|        20.0|Bangalore|1404773|
|              Amazon|Data Scienti