# Importing Python libraries and creating the SparkSession

In [1]:
import glob
from pyspark.sql import SparkSession
import pyspark.pandas as pd
from deep_translator import GoogleTranslator
import pandas as ps
from pyspark.sql.functions import trim,upper,lower,to_date,to_timestamp,transform,split,col
from pyspark.sql.functions import dayofmonth,month,hour
from pyspark.sql.types import StringType,IntegerType
from pyspark.sql.functions import udf,col
from pathlib import Path



# Build (or reuse) the Spark session for this notebook under a fixed application name.
spark = (
    SparkSession.builder
    .appName("SparkSQLTransformApp")
    .getOrCreate()
)



In [3]:
# Session-level Arrow options, applied one key at a time.
# NOTE(review): Arrow conversion is switched off here while selfDestruct is
# switched on -- confirm this combination is intended.
for conf_key, conf_value in (
    ("spark.sql.execution.arrow.pyspark.enabled", "false"),
    ("spark.sql.execution.arrow.pyspark.selfDestruct.enabled", "true"),
):
    spark.conf.set(conf_key, conf_value)

In [15]:
# Input and output locations, both resolved against the parent of the current
# working directory.
_PROJECT_ROOT = Path.cwd().parent

# Glob pattern matching every .xlsx file in the raw-extracts folder.
EXTRACTSPATH = _PROJECT_ROOT / "EXTRACTS_RAW" / "*.xlsx"

# Folder that receives the staged output.
STAGINGPATH = _PROJECT_ROOT / "STAGING_LAKE"


C:\Users\cvb\Documents\automation_python\PM kisan call center query project\project\EXTRACTS_RAW\*.xlsx


# Listing the raw extract files found in the extracts directory

In [11]:
# glob.glob returns matches in arbitrary, filesystem-dependent order; sorting
# the result makes the files load in the same order on every run.
extract_list = sorted(glob.glob(str(EXTRACTSPATH)))

#extract_list

['C:\\Users\\cvb\\Documents\\automation_python\\PM kisan call center query project\\project\\extracts\\2023_10extracts.xlsx',
 'C:\\Users\\cvb\\Documents\\automation_python\\PM kisan call center query project\\project\\extracts\\2023_11extracts.xlsx',
 'C:\\Users\\cvb\\Documents\\automation_python\\PM kisan call center query project\\project\\extracts\\2023_12extracts.xlsx',
 'C:\\Users\\cvb\\Documents\\automation_python\\PM kisan call center query project\\project\\extracts\\2023_6extracts.xlsx',
 'C:\\Users\\cvb\\Documents\\automation_python\\PM kisan call center query project\\project\\extracts\\2023_7extracts.xlsx',
 'C:\\Users\\cvb\\Documents\\automation_python\\PM kisan call center query project\\project\\extracts\\2023_8extracts.xlsx',
 'C:\\Users\\cvb\\Documents\\automation_python\\PM kisan call center query project\\project\\extracts\\2023_9extracts.xlsx']

# Reading each extract file and combining all of them into a single pandas DataFrame

In [6]:
# Read every extract file, then combine them with a single concat call.
# DataFrame.append was removed in pandas 2.0 and copied the whole accumulated
# frame on each iteration; collecting the pieces and concatenating once avoids both.
extract_frames = []
for extract_file in extract_list:
    extract_frames.append(ps.read_excel(extract_file, index_col=None))
    print(f'{extract_file} appended...')

# concat raises ValueError on an empty list, so keep an empty frame when no
# extract files were found. ignore_index gives the combined frame one clean
# 0..n-1 index instead of repeating each file's own row numbers.
if extract_frames:
    frame = ps.concat(extract_frames, ignore_index=True)
else:
    frame = ps.DataFrame([])

C:\Users\cvb\Documents\automation_python\PM kisan call center query project\project\extracts\2023_10extracts.xlsx appended...
C:\Users\cvb\Documents\automation_python\PM kisan call center query project\project\extracts\2023_11extracts.xlsx appended...
C:\Users\cvb\Documents\automation_python\PM kisan call center query project\project\extracts\2023_12extracts.xlsx appended...
C:\Users\cvb\Documents\automation_python\PM kisan call center query project\project\extracts\2023_6extracts.xlsx appended...
C:\Users\cvb\Documents\automation_python\PM kisan call center query project\project\extracts\2023_7extracts.xlsx appended...
C:\Users\cvb\Documents\automation_python\PM kisan call center query project\project\extracts\2023_8extracts.xlsx appended...
C:\Users\cvb\Documents\automation_python\PM kisan call center query project\project\extracts\2023_9extracts.xlsx appended...


# Converting the combined pandas DataFrame to a Spark DataFrame with the default (inferred) schema

In [12]:
# drop_duplicates() returns a new frame and leaves `frame` untouched, so the
# result has to be captured -- otherwise duplicates still reach Spark.
frame_unique = frame.drop_duplicates()

# Convert the de-duplicated pandas frame to a Spark DataFrame and show the
# schema Spark inferred for it.
kccFrame = spark.createDataFrame(frame_unique)
kccFrame.printSchema()

NameError: name 'frame' is not defined

In [11]:
# Derive two helper columns from kccFrame:
#   kcc   -- the part of KccAns before the first newline
#   Crops -- the part of Crop before the first "("
# split() takes a regex pattern, so the parenthesis must be escaped. A raw
# string is used because "\(" in a normal string literal is an invalid Python
# escape sequence (it warns today and is slated to become an error).
kccFrame1 = kccFrame.withColumn("kcc", split(col("KccAns"), "\n", 2).getItem(0))
kccFrame1 = kccFrame1.withColumn("Crops", split(col("Crop"), r"\(", 2).getItem(0))

In [12]:
# Capture the frame's current column names as a list and display them.
columns_list = list(kccFrame1.columns)
columns_list

['Season',
 'Sector',
 'Category',
 'Crop',
 'QueryType',
 'QueryText',
 'KccAns',
 'StateName',
 'DistrictName',
 'BlockName',
 'CreatedOn',
 'kcc',
 'Crops']

# Trimming spaces and converting all records to lowercase by looping over each column

In [15]:
for column_name in columns_list:
    # Lower-case first, then trim surrounding spaces, and write the result
    # back under the same column name.
    cleaned_column = trim(lower(col(column_name)))
    kccFrame1 = kccFrame1.withColumn(column_name, cleaned_column)
    print(column_name+"..Done")

Season..Done
Sector..Done
Category..Done
Crop..Done
QueryType..Done
QueryText..Done
KccAns..Done
StateName..Done
DistrictName..Done
BlockName..Done
CreatedOn..Done
kcc..Done
Crops..Done


# Splitting the CreatedOn column into a separate created date and time of query

In [17]:

# Split CreatedOn on the letter 't' (limit -1 = no cap on the number of pieces):
# piece 0 becomes createdDate and piece 1 becomes createdTime.
# NOTE(review): the lowercase 't' presumably matches the date/time separator
# after the earlier lower-casing of every column -- confirm against the data.
created_parts = split(col('CreatedOn'), 't', -1)
kccFrame1 = (
    kccFrame1
    .withColumn("createdDate", created_parts.getItem(0))
    .withColumn("createdTime", created_parts.getItem(1))
)

In [18]:
# Parse the createdDate string into a date-typed column (to_date with its
# default format), then display the frame's schema summary.
parsed_date = to_date(col("createdDate"))
kccFrame1 = kccFrame1.withColumn("convertedDate", parsed_date)
kccFrame1

DataFrame[Season: string, Sector: string, Category: string, Crop: string, QueryType: string, QueryText: string, KccAns: string, StateName: string, DistrictName: string, BlockName: string, CreatedOn: string, kcc: string, Crops: string, createdDate: string, createdTime: string, convertedDate: date]

# Generating the month, day and hour at which each query was created, using PySpark's built-in datetime functions

In [20]:
# Month and day-of-month come from convertedDate; the hour comes from createdTime.
kccFrame1 = (
    kccFrame1
    .withColumn("createdMonth", month('convertedDate'))
    .withColumn("createdDay", dayofmonth('convertedDate'))
    .withColumn("createdHour", hour('createdTime'))
)

# Mark the frame for caching; the DataFrame returned by cache() is what the cell displays.
kccFrame1.cache()

DataFrame[Season: string, Sector: string, Category: string, Crop: string, QueryType: string, QueryText: string, KccAns: string, StateName: string, DistrictName: string, BlockName: string, CreatedOn: string, kcc: string, Crops: string, createdDate: string, createdTime: string, convertedDate: date, createdMonth: int, createdDay: int, createdHour: int]

# Creating a regular PySpark UDF to convert the few KCC answers that are in Telugu to English

In [22]:
def convertTextEng(text):
    """Translate ``text`` from Telugu ('te') to English ('en') via GoogleTranslator.

    Args:
        text: String to translate. May be ``None`` when the source value is null.

    Returns:
        The translated string. ``None`` and empty / whitespace-only strings are
        returned unchanged without calling the translator.
    """
    # A null cell reaches the function as None, and the translator raises on a
    # non-string payload; one such row would otherwise abort the whole Spark
    # job. Blank strings carry nothing to translate, so skip the call for them.
    if text is None or not text.strip():
        return text
    return GoogleTranslator(source='te', target='en').translate(text)

#spark.udf.register("convertTextEng",convertTextEng)  for sql dataframe
# Wrap convertTextEng as a column-level UDF whose declared return type is string.
convertToEng = udf(f=convertTextEng, returnType=StringType())


In [201]:
# Add kccEng by running the translation UDF over the kcc column, then drop the
# Season, KccAns, CreatedOn and createdDate columns from the frame.
kccFrame1 = (
    kccFrame1
    .withColumn('kccEng', convertToEng('kcc'))
    .drop("Season", "KccAns", "CreatedOn", "createdDate")
)

# Writing the final denormalized table in Parquet format, partitioned by the month in which each KCC query was created

In [225]:
# Write the frame as a parquet dataset under the staging directory, partitioned
# by createdMonth; "overwrite" mode replaces any output already at that path.
parquet_target = str(STAGINGPATH)+"/kcc.parquet"
(
    kccFrame1.write
    .partitionBy("createdMonth")
    .mode("overwrite")
    .parquet(parquet_target)
)

In [28]:
#parDF1.select("createdMonth").distinct().write.parquet(str(STAGINGPATH)+"/kcc.parquet")