In [0]:
from pyspark.sql.functions import col, trim, when

# Build the input path from the user running the notebook so that it works for
# whoever executes it, not just for one account.
user_name = dbutils.notebook.entry_point.getDbutils().notebook().getContext().userName().get()
file_location = f'/Workspace/Users/{user_name}/data/telco.csv'
file_type = "csv"

# CSV options
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","

# The applied options are for CSV files. For other file types, these will be ignored.
df = (
    spark.read.format(file_type)
    .option("inferSchema", infer_schema)
    .option("header", first_row_is_header)
    .option("sep", delimiter)
    .load(file_location)
)

# Replace blank TotalCharges values with NULL before converting to double.
# Comparing the *trimmed* value with "" catches the empty string and any run of
# whitespace, not only a single space character. The explicit cast to string
# keeps trim() valid even if schema inference already produced a numeric column.
total_charges_is_blank = trim(col("TotalCharges").cast("string")) == ""
df_tc_dbl = df.withColumn(
    "TotalCharges",
    when(total_charges_is_blank, None).otherwise(col("TotalCharges")),
)
df_tc_dbl = df_tc_dbl.withColumn('TotalCharges', col('TotalCharges').cast('double'))

display(df_tc_dbl)

In [0]:
from pyspark.sql.functions import desc, asc

# Keep only the columns needed for the contract / charges analysis.
df_selection = df_tc_dbl.select(
    'Contract', 'PhoneService', 'InternetService', 'TotalCharges', 'MonthlyCharges', 'tenure'
)

# Customers on a one- or two-year contract who pay more than 110 per month.
# Column predicates are combined with `|` (OR) and `&` (AND); each comparison
# must be wrapped in parentheses because these operators bind tighter than
# `==` and `>`.
is_long_contract = (df_selection['Contract'] == 'One year') | (df_selection['Contract'] == 'Two year')
is_expensive = df_selection['MonthlyCharges'] > 110
df_filtered = df_selection.filter(is_long_contract & is_expensive)

# A single sort is enough: contract type ascending, then the most expensive
# plans first within each contract type. (A preceding sort by MonthlyCharges
# alone would be discarded by this one.)
df_filtered = df_filtered.sort(asc('Contract'), desc('MonthlyCharges'))

display(df_filtered)

In [0]:
# Number of rows for every (Contract, InternetService) combination,
# printed as a text table ordered by contract type.
contract_internet_counts = df_tc_dbl.groupBy('Contract', 'InternetService').count()
contract_internet_counts.orderBy(asc('Contract')).show()

In [0]:

# NOTE: this import replaces Python's built-in round() with the Spark column
# function of the same name for the rest of the notebook.
from pyspark.sql.functions import avg, count, round

# Per (Contract, PhoneService) group: the average MonthlyCharges rounded to
# two decimal places, and the number of rows with a non-null Contract.
avg_monthly_charges = round(avg('MonthlyCharges'), 2).alias('Avg MonthlyCharges')
contract_count = count('Contract').alias('Count')

df_res = df_tc_dbl.groupBy('Contract', 'PhoneService').agg(avg_monthly_charges, contract_count)
display(df_res)

Databricks visualization. Run in Databricks to view.

In [0]:
# Bring the aggregated result to the driver; collect() returns a Python list
# of Row objects.
df_res_coll = df_res.collect()
print(type(df_res_coll))
print(df_res_coll)

avg_field = 'Avg MonthlyCharges'
for row in df_res_coll:
    print(row)
    row_dict = row.asDict()
    val = row[avg_field]
    print(type(val))
    # Three equivalent ways to read the same field: indexing the Row by name,
    # indexing its dict form, and dict.get().
    for looked_up in (row[avg_field], row_dict[avg_field], row_dict.get(avg_field)):
        print(looked_up)
    



In [0]:
# Alternative Spark writers, kept for reference (not executed):
#df_tc_dbl.write.parquet(f'/Workspace/Users/{user_name}/df.parquet')

#fileOutPath = f"/Workspace/Users/{user_name}/data/telco_filtered.tsv.gz"
#df_tc_dbl.coalesce(1).write.option("header", True).option("delimiter", "\t").option("compression", "gzip").format('csv').save(fileOutPath)

# Convert the aggregated result to pandas and write it as CSV next to the
# input data. The path is derived from `user_name` (resolved in the loading
# cell) instead of a hard-coded account, so the notebook runs for any user.
pd_res = df_res.toPandas()
pd_res.to_csv(f'/Workspace/Users/{user_name}/data/telco_agg_res.csv')


In [0]:
%sh
# Long-format listing of everything directly under /Workspace, hidden entries included.
ls -la /Workspace

In [0]:
# `round` here is the Spark column function imported in the aggregation cell,
# not Python's built-in.
# Expected total if the monthly charge had been constant over the whole tenure.
df_tc_mc = df_tc_dbl.withColumn(
    'Tenure x Monthly Charges',
    round(df_tc_dbl['tenure'] * df_tc_dbl['MonthlyCharges'], 2),
)

# Difference between that expected total and the recorded TotalCharges.
df_tc_mc = df_tc_mc.withColumn(
    'Diff Charges',
    round(col('Tenure x Monthly Charges') - col('TotalCharges'), 2),
)

display(df_tc_mc.limit(100))

eq = df_tc_mc.filter(col('Diff Charges') == 0).count()
neq = df_tc_mc.filter(col('Diff Charges') != 0).count()
# A NULL difference (e.g. TotalCharges was blank) satisfies neither `== 0` nor
# `!= 0`, so those rows are counted separately instead of silently vanishing.
diff_unknown = df_tc_mc.filter(col('Diff Charges').isNull()).count()
print(f'eq={eq}, neq={neq}')
print(f'null Diff Charges={diff_unknown}')

In [0]:
from pyspark.sql.functions import lit

# lit() turns a Python constant into a column expression, so every row gets
# the same value. The value here is the string '15', not a number.
fixed_value = lit('15')
df_tc_mc = df_tc_mc.withColumn('Fixed', fixed_value)
display(df_tc_mc)

In [0]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

@udf(returnType=StringType())
def convert_contract(cntr):
    """Shorten a contract label.

    Replaces 'Month-to-month' with 'monthly', 'One' with '1' and 'Two' with
    '2' (e.g. 'One year' -> '1 year'). Any other text is left unchanged.

    Args:
        cntr: the contract label, or None for a NULL cell.

    Returns:
        The shortened label, or None when the input is None.
    """
    # A NULL cell reaches the UDF as None; pass it through instead of
    # failing on None.replace(...).
    if cntr is None:
        return None
    return (
        cntr.replace('Month-to-month', 'monthly')
        .replace('One', '1')
        .replace('Two', '2')
    )

# Add the shortened contract label next to the original column.
contract_short = convert_contract('Contract')
df_res2 = df_tc_mc.withColumn('Contract Short', contract_short)
display(df_res2)


TODO: Check whether customers with dependents pay more on average than customers without dependents. The answer should be given as a single boolean value (True or False).