## PySpark COde Practice

In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql import *
from pyspark.sql.functions import *  # Import the function
spark = SparkSession.builder.getOrCreate()
from pyspark.sql.functions import regexp_replace, col
from google.colab import drive

#### 1. Calculate the difference (in days) between the start_date and end_date.

In [6]:
df = spark.createDataFrame(
      [
        ("2023-01-01", "2023-02-01"),
        ("2023-03-15", "2023-03-20")
      ],
      ["start_date", "end_date"]
    )

df.show()
dif_df = df.withColumn("date_diffrence", datediff(col("end_date"), col("start_date")))
dif_df.show()

+----------+----------+
|start_date|  end_date|
+----------+----------+
|2023-01-01|2023-02-01|
|2023-03-15|2023-03-20|
+----------+----------+

+----------+----------+--------------+
|start_date|  end_date|date_diffrence|
+----------+----------+--------------+
|2023-01-01|2023-02-01|            31|
|2023-03-15|2023-03-20|             5|
+----------+----------+--------------+



#### 2. Extracting Year from a Date Column Sometimes, you may need to extract specific parts of a date, such as the year, especially when grouping data by year.

In [7]:
# Define schema for the dataframe
schema = StructType(
    [
        StructField('event_date', StringType(), True)
    ]
)

df =spark.createDataFrame([("2023-04-15",),
 ("2022-11-30",),
 ("2021-08-25",)], schema)
df.show()

#Convert event date column as date type

df = df.withColumn('event_date', to_date(col('event_date'), 'yyyy-MM-dd'))
df.withColumn('year', year(col('event_date'))).show()




+----------+
|event_date|
+----------+
|2023-04-15|
|2022-11-30|
|2021-08-25|
+----------+

+----------+----+
|event_date|year|
+----------+----+
|2023-04-15|2023|
|2022-11-30|2022|
|2021-08-25|2021|
+----------+----+



####3. Filter Records Based on Date

Filtering records based on specific date ranges is a common operation in time-series data.

#### Scenario:
Filter records where event_date is after 2023-06-01.

In [8]:
# Define schema for the dataframe
schema = StructType([
 StructField("event_date", StringType(), True)
])
# Create DataFrame
df = spark.createDataFrame([
 ("2023-05-15",),
 ("2023-07-20",),
 ("2023-06-05",)
], schema)
df.show()
#Convert event date to data type and filter for dates after '2023-06-01'
df = df.withColumn('event_date', to_date(col("event_date"), 'yyyy-MM-dd'))
df.filter(col('event_date') > '2023-06-01').show()

+----------+
|event_date|
+----------+
|2023-05-15|
|2023-07-20|
|2023-06-05|
+----------+

+----------+
|event_date|
+----------+
|2023-07-20|
|2023-06-05|
+----------+



#### 4. Add Days to Date

Adding a certain number of days to a date is crucial when planning tasks or calculating deadlines.

#### Scenario:
Add 30 days to each date in the order_date column.

In [9]:
df = spark.createDataFrame([
    ("2023-01-10",),
    ("2023-06-15",),
    ("2023-07-30",)
], ["order_date"])

#convert order_date to DataType and add 30 days
df = df.withColumn("order_date", to_date(col('order_date'), 'yyyy-MM-dd'))
add_df = df.withColumn("new_order_date", date_add(col('order_date'), 30))
add_df.show()



+----------+--------------+
|order_date|new_order_date|
+----------+--------------+
|2023-01-10|    2023-02-09|
|2023-06-15|    2023-07-15|
|2023-07-30|    2023-08-29|
+----------+--------------+



####5. Find the Maximum Date in a Column

Finding the latest date in a dataset is helpful when dealing with transaction logs or event timestamps.

#### Scenario

Find the latest payment_date in the column.


In [10]:
df = spark.createDataFrame([
 ("2023-02-15",),
 ("2023-06-25",),
 ("2023-01-10",)
], ["payment_date"])

df.show()
#Convert date into Date type and get the latest payment date
df = df.withColumn('payment_date', to_date(col('payment_date'), "yyyy-MM-dd"))
df.agg(max('payment_date').alias("latest_payment_date")).show()


+------------+
|payment_date|
+------------+
|  2023-02-15|
|  2023-06-25|
|  2023-01-10|
+------------+

+-------------------+
|latest_payment_date|
+-------------------+
|         2023-06-25|
+-------------------+



In [11]:
#df.groupBy("Catagory").agg(sum('price')).alias('total_price').show()

In [12]:
def is_palindrom(n):
  return str(n) == str(n)[::-1]

  print(is_palindrom(121))

In [None]:
df1.join(df2, on='id', 'inner').show

In [13]:
def sqr(x):
  return x*x

sqr_udf = udf(sqr, IntegerType())
df.withColumn("Squre", sqr_udf(df["value"]))

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `value` cannot be resolved. Did you mean one of the following? [`payment_date`].

In [None]:
filterd_df = df.filter(df['age']>30)
filterd_df.show()

In [15]:
from collections import Counter
def word_frequency(word):
  return Counter(word.split())

print(word_frequency('A column or function parameter with name function parameter'))

Counter({'function': 2, 'parameter': 2, 'A': 1, 'column': 1, 'or': 1, 'with': 1, 'name': 1})


In [17]:
def getWordFrequency(sentence):
  word_fre= {}
  words = sentence.split()
  for word in words:
    if word in word_fre:
      word_fre[word] +=1
    else:
      word_fre[word] = 1
  return word_fre

print(getWordFrequency('A column or function parameter with name function parameter'))

{'A': 1, 'column': 1, 'or': 1, 'function': 2, 'parameter': 2, 'with': 1, 'name': 1}
