#Task 1

#Covid 19 data set


In [None]:
# pyspark_script.py

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum, max, min, avg, to_date

1. Start Spark Session

In [None]:
spark = SparkSession.builder \
    .appName("Big Data Analysis - CODTECH Internship") \
    .getOrCreate()

 2. Load the Dataset

In [None]:
df = spark.read.csv("/content/complete.csv", header=True, inferSchema=True)

3. Basic Cleanup: Fix date and data types

In [None]:
df = df.withColumn("Date", to_date(col("Date"), "yyyy-MM-dd"))
df = df.withColumn("Death", col("Death").cast("int"))

 4. Show Schema and Initial Data

In [None]:
df.printSchema()
df.show(5)

root
 |-- Date: date (nullable = true)
 |-- Name of State / UT: string (nullable = true)
 |-- Latitude: double (nullable = true)
 |-- Longitude: double (nullable = true)
 |-- Total Confirmed cases: double (nullable = true)
 |-- Death: integer (nullable = true)
 |-- Cured/Discharged/Migrated: double (nullable = true)
 |-- New cases: integer (nullable = true)
 |-- New deaths: integer (nullable = true)
 |-- New recovered: integer (nullable = true)

+----------+------------------+--------+---------+---------------------+-----+-------------------------+---------+----------+-------------+
|      Date|Name of State / UT|Latitude|Longitude|Total Confirmed cases|Death|Cured/Discharged/Migrated|New cases|New deaths|New recovered|
+----------+------------------+--------+---------+---------------------+-----+-------------------------+---------+----------+-------------+
|2020-01-30|            Kerala| 10.8505|  76.2711|                  1.0|    0|                      0.0|        0|         0|     

5. Aggregation: Total Cases, Deaths, Recoveries per State

In [None]:
summary_df = df.groupBy("Name of State / UT").agg(
    sum("Total Confirmed cases").alias("Total Confirmed"),
    sum("Death").alias("Total Deaths"),
    sum("Cured/Discharged/Migrated").alias("Total Recovered")
).orderBy(col("Total Confirmed").desc())

summary_df.show(10)

+------------------+---------------+------------+---------------+
|Name of State / UT|Total Confirmed|Total Deaths|Total Recovered|
+------------------+---------------+------------+---------------+
|       Maharashtra|    1.5192247E7|      587648|      8145889.0|
|        Tamil Nadu|      7847083.0|      111765|      5204625.0|
|             Delhi|      5766124.0|      171177|      4205359.0|
|    Andhra Pradesh|      2742054.0|       30560|      1367852.0|
|         Karnataka|      2733901.0|       51221|      1129397.0|
|           Gujarat|      2730710.0|      137790|      1824579.0|
|     Uttar Pradesh|      2462456.0|       56959|      1474322.0|
|         Telangana|      1644466.0|       17538|      1111473.0|
|         Rajasthan|      1622247.0|       32326|      1145351.0|
|       West Bengal|      1602230.0|       50953|       989262.0|
+------------------+---------------+------------+---------------+
only showing top 10 rows



6. Trend Analysis: Total new cases per day (National trend)

In [None]:
daily_trend = df.groupBy("Date").agg(
    sum("New cases").alias("Daily New Cases"),
    sum("New deaths").alias("Daily Deaths"),
    sum("New recovered").alias("Daily Recoveries")
).orderBy("Date")

daily_trend.show(10)

+----------+---------------+------------+----------------+
|      Date|Daily New Cases|Daily Deaths|Daily Recoveries|
+----------+---------------+------------+----------------+
|2020-01-30|              0|           0|               0|
|2020-01-31|              0|           0|               0|
|2020-02-01|              1|           0|               0|
|2020-02-02|              1|           0|               0|
|2020-02-03|              0|           0|               0|
|2020-02-04|              0|           0|               0|
|2020-02-05|              0|           0|               0|
|2020-02-06|              0|           0|               0|
|2020-02-07|              0|           0|               0|
|2020-02-08|              0|           0|               0|
+----------+---------------+------------+----------------+
only showing top 10 rows



7. Insights

In [None]:
most_affected = summary_df.orderBy(col("Total Confirmed").desc()).first()
least_affected = summary_df.orderBy(col("Total Confirmed").asc()).first()

print(f"Most affected state: {most_affected['Name of State / UT']} with {most_affected['Total Confirmed']} cases")
print(f"Least affected state: {least_affected['Name of State / UT']} with {least_affected['Total Confirmed']} cases")


Most affected state: Maharashtra with 15192247.0 cases
Least affected state: Union Territory of Chandigarh with 2.0 cases


8. Stop Spark session

In [None]:
spark.stop()

#task 2

**Machine** **Learning** **Model**

 task2_ml_prediction.ipynb

1: Import Libraries

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

 Step 2: Load Data

In [6]:
df = pd.read_csv("/content/complete.csv")

FileNotFoundError: [Errno 2] No such file or directory: '/content/complete.csv'

In [3]:
df.head()

NameError: name 'df' is not defined

3: Preprocessing

In [None]:
df['Death'] = pd.to_numeric(df['Death'], errors='coerce')  # convert to numeric
df.dropna(inplace=True)

4: Feature Selection

In [None]:
features = ['Total Confirmed cases', 'Cured/Discharged/Migrated', 'Death', 'New deaths', 'New recovered']
X = df[features]
y = df['New cases']

5: Train-Test Split

In [1]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

NameError: name 'train_test_split' is not defined

In [2]:
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

NameError: name 'RandomForestRegressor' is not defined