# Importing python libs and SparkSession

In [1]:
import glob
from pyspark.sql import SparkSession
import pyspark.pandas as pd
from deep_translator import GoogleTranslator
import pandas as ps
from pyspark.sql.functions import trim,upper,lower,to_date,to_timestamp,transform,split,col
from pyspark.sql.functions import dayofmonth,month,hour
from pyspark.sql.types import StringType,IntegerType
from pyspark.sql.functions import udf,col
from pathlib import Path



spark=SparkSession\
           .builder\
           .appName("SparkSQLTransformApp")\
           .getOrCreate()



In [3]:
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "false")
spark.conf.set("spark.sql.execution.arrow.pyspark.selfDestruct.enabled", "true")

In [8]:
# defining path for extracts raw files

EXTRACTSPATH = Path.cwd().parent/"extracts"/"*.xlsx"

# Reading extracts raw files from extracts directory

In [11]:
extract_list = glob.glob(str(EXTRACTSPATH))

extract_list

['C:\\Users\\cvb\\Documents\\automation_python\\PM kisan call center query project\\project\\extracts\\2023_10extracts.xlsx',
 'C:\\Users\\cvb\\Documents\\automation_python\\PM kisan call center query project\\project\\extracts\\2023_11extracts.xlsx',
 'C:\\Users\\cvb\\Documents\\automation_python\\PM kisan call center query project\\project\\extracts\\2023_12extracts.xlsx',
 'C:\\Users\\cvb\\Documents\\automation_python\\PM kisan call center query project\\project\\extracts\\2023_6extracts.xlsx',
 'C:\\Users\\cvb\\Documents\\automation_python\\PM kisan call center query project\\project\\extracts\\2023_7extracts.xlsx',
 'C:\\Users\\cvb\\Documents\\automation_python\\PM kisan call center query project\\project\\extracts\\2023_8extracts.xlsx',
 'C:\\Users\\cvb\\Documents\\automation_python\\PM kisan call center query project\\project\\extracts\\2023_9extracts.xlsx']

# Reading extracts in chunks and appending all dataframes into empty dataframe

In [6]:

frame=ps.DataFrame([])
for chunks in extract_list:
    df=ps.read_excel(chunks,index_col=None)
    frame=frame.append(df)
    print(f'{chunks} appended...')

C:\Users\cvb\Documents\automation_python\PM kisan call center query project\project\extracts\2023_10extracts.xlsx appended...
C:\Users\cvb\Documents\automation_python\PM kisan call center query project\project\extracts\2023_11extracts.xlsx appended...
C:\Users\cvb\Documents\automation_python\PM kisan call center query project\project\extracts\2023_12extracts.xlsx appended...
C:\Users\cvb\Documents\automation_python\PM kisan call center query project\project\extracts\2023_6extracts.xlsx appended...
C:\Users\cvb\Documents\automation_python\PM kisan call center query project\project\extracts\2023_7extracts.xlsx appended...
C:\Users\cvb\Documents\automation_python\PM kisan call center query project\project\extracts\2023_8extracts.xlsx appended...
C:\Users\cvb\Documents\automation_python\PM kisan call center query project\project\extracts\2023_9extracts.xlsx appended...


# Converting Appended pandas Dataframe to Spark DataFrame with Default schema

In [10]:

frame.drop_duplicates().info()

kccFrame = spark.createDataFrame(frame)
kccFrame.printSchema()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1984 entries, 0 to 185
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Season        0 non-null      float64
 1   Sector        1984 non-null   object 
 2   Category      1984 non-null   object 
 3   Crop          1984 non-null   object 
 4   QueryType     1984 non-null   object 
 5   QueryText     1984 non-null   object 
 6   KccAns        1984 non-null   object 
 7   StateName     1984 non-null   object 
 8   DistrictName  1984 non-null   object 
 9   BlockName     1984 non-null   object 
 10  CreatedOn     1984 non-null   object 
dtypes: float64(1), object(10)
memory usage: 186.0+ KB
root
 |-- Season: double (nullable = true)
 |-- Sector: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Crop: string (nullable = true)
 |-- QueryType: string (nullable = true)
 |-- QueryText: string (nullable = true)
 |-- KccAns: string (nullable = true)
 |-

In [11]:
# Transforming kccans with split on '\n' and crop on '(' 

kccFrame1 = kccFrame.withColumn("kcc",split(col("KccAns"),"\n",2).getItem(0))
kccFrame1 = kccFrame1.withColumn("Crops",split(col("Crop"),"\(",2).getItem(0))

In [12]:
columns_list = kccFrame1.columns
columns_list

['Season',
 'Sector',
 'Category',
 'Crop',
 'QueryType',
 'QueryText',
 'KccAns',
 'StateName',
 'DistrictName',
 'BlockName',
 'CreatedOn',
 'kcc',
 'Crops']

# Triming spaces and converting all records to lowercase using loop over each column

In [15]:
for key in columns_list:
    kccFrame1 = kccFrame1.withColumn(key,lower((key)))
    kccFrame1 = kccFrame1.withColumn(key,trim(key))
    print(key+"..Done")

Season..Done
Sector..Done
Category..Done
Crop..Done
QueryType..Done
QueryText..Done
KccAns..Done
StateName..Done
DistrictName..Done
BlockName..Done
CreatedOn..Done
kcc..Done
Crops..Done


In [14]:
kccFrame1.select("Kcc").take(4)

[Row(Kcc='cloudy weather, there is a chance of  showers in your area'),
 Row(Kcc='recommended to spray emamectin benzoate (proclaim) 90 grams /150 -200 litres of water / acre'),
 Row(Kcc='cloudy weather , chance of showers in your area'),
 Row(Kcc='మీ ప్రాంతములో వాతావరణం మబ్బులుగా  ఉంటుంది, జల్లులు  పడే అవకాశం ఉంది.')]

# Splitting column CreatedOn to seperate timeofQuery and Date created

In [17]:

kccFrame1 = kccFrame1.withColumn("createdDate",split('CreatedOn','t',-1)\
               .getItem(0))\
                .withColumn("createdTime",split('CreatedOn','t',-1)\
                .getItem(1))\
                #.take(4)

In [18]:
kccFrame1 = kccFrame1.withColumn("convertedDate",to_date(col("createdDate")))
kccFrame1

DataFrame[Season: string, Sector: string, Category: string, Crop: string, QueryType: string, QueryText: string, KccAns: string, StateName: string, DistrictName: string, BlockName: string, CreatedOn: string, kcc: string, Crops: string, createdDate: string, createdTime: string, convertedDate: date]

# Generating Months,Day,Hour of query created for using pyspark inbuilt datetime functions

In [20]:
kccFrame1=kccFrame1.withColumn("createdMonth",month('convertedDate'))\
.withColumn("createdDay",dayofmonth('convertedDate'))\
.withColumn("createdHour",hour('createdTime'))

kccFrame1.cache()

DataFrame[Season: string, Sector: string, Category: string, Crop: string, QueryType: string, QueryText: string, KccAns: string, StateName: string, DistrictName: string, BlockName: string, CreatedOn: string, kcc: string, Crops: string, createdDate: string, createdTime: string, convertedDate: date, createdMonth: int, createdDay: int, createdHour: int]

In [21]:
kccFrame1.explain()

== Physical Plan ==
InMemoryTableScan [Season#494, Sector#522, Category#550, Crop#578, QueryType#606, QueryText#634, KccAns#662, StateName#690, DistrictName#718, BlockName#746, CreatedOn#774, kcc#802, Crops#830, createdDate#875, createdTime#891, convertedDate#907, createdMonth#981, createdDay#1001, createdHour#1021]
   +- InMemoryRelation [Season#494, Sector#522, Category#550, Crop#578, QueryType#606, QueryText#634, KccAns#662, StateName#690, DistrictName#718, BlockName#746, CreatedOn#774, kcc#802, Crops#830, createdDate#875, createdTime#891, convertedDate#907, createdMonth#981, createdDay#1001, createdHour#1021], StorageLevel(disk, memory, deserialized, 1 replicas)
         +- *(1) Project [trim(lower(trim(lower(cast(Season#66 as string)), None)), None) AS Season#494, trim(lower(trim(lower(Sector#67), None)), None) AS Sector#522, trim(lower(trim(lower(Category#68), None)), None) AS Category#550, trim(lower(trim(lower(Crop#69), None)), None) AS Crop#578, trim(lower(trim(lower(QueryType

# Creating a pyspark regula UDF function for converting few kccans which are in telugu lang to english

In [22]:
def convertTextEng(text):
    return GoogleTranslator(source='te', target='en').translate(text)

#spark.udf.register("convertTextEng",convertTextEng)  for sql dataframe
convertToEng = udf(convertTextEng,StringType()) 


In [201]:
kccFrame1 = kccFrame1.withColumn('kccEng',convertToEng('kcc'))

kccFrame1 = kccFrame1.drop("Season","KccAns","CreatedOn","createdDate")

# Writing final denormalized table to parquet format with partiotioned by months of kcc generated

In [225]:
kccFrame1.write.partitionBy("createdMonth").mode("overwrite").parquet("kcc.parquet")

In [206]:
parDF1=spark.read.parquet("kcc.parquet")
parDF1

In [211]:
parDF1.select("kcc","kccEng").show()

+--------------------+--------------------+
|                 kcc|              kccEng|
+--------------------+--------------------+
|cloudy weather, t...|cloudy weather, t...|
|recommended to sp...|recommended to sp...|
|cloudy weather , ...|cloudy weather, c...|
|మీ ప్రాంతములో వాత...|The weather in yo...|
|recommended to do...|Recommended to do...|
|recommended to sp...|recommended to sp...|
|recommended to sp...|recommended to sp...|
|recommended to sp...|recommended to sp...|
|recommended to sp...|recommended to sp...|
|recommended to sp...|recommended to sp...|
|cloudy weather , ...|cloudy weather, c...|
|echnical name : f...|Technical name : ...|
|crop duration : 1...|crop duration : 1...|
|recommended to sp...|recommended to sp...|
|there is chance o...|there is chance o...|
|recommended  to s...|recommended to sp...|
|మీ ప్రాంతములో వాత...|The weather in yo...|
|recommended to co...|recommended to co...|
|recommended do no...|recommended do no...|
|recommended to sp...|recommende