# ***Pre Steps***

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# ***EX_1***

## ***Random Data.***

In [None]:
# Reproducible synthetic series: seed NumPy's global generator, then draw
# 100 samples from the standard normal distribution.
np.random.seed(seed=42)
values = np.random.randn(100)
values  # echo the array as the cell output

In [None]:
# One daily timestamp per generated value, starting on 2010-01-01.
dates = pd.date_range(
    start="2010-01-01",
    periods=len(values),
    freq="D",
)

In [None]:
# Single-column frame of the random values, indexed by the daily dates.
data1 = pd.DataFrame(
    values,
    columns=["Values"],
    index=dates,
)
data1  # echo the frame as the cell output

## ***Dataset Data***

In [None]:
# Load the raw sales records (this rebinds data1) and preview both ends.
data1 = pd.read_csv("data/raw_sales.csv")

display("First Five Rows :")
display(data1.head())

display("Last Five Rows :")
display(data1.tail())

In [None]:
# Summary statistics of the sales frame.
display("Summary Statistics :")
display(data1.describe())

In [None]:
# Filter the sales down to a single calendar year.
# The sale dates are converted to datetimes first so the .dt accessor works.
data1["datesold"] = pd.to_datetime(data1["datesold"])
'''
query_year=int(input("Enter year : "))
'''
query_year = 2015  # fixed year; the interactive prompt above is disabled
sold_in_query_year = data1["datesold"].dt.year == query_year
display(data1[sold_in_query_year])

In [None]:
# Average sale price for each calendar year: table first, then a line chart.
sale_year = data1["datesold"].dt.year
avg_price_per_year = (
    data1.groupby(sale_year)["price"]
    .mean()
    .reset_index(name="Average Price")
)
display(avg_price_per_year)

# After reset_index the year values sit in a column still named "datesold".
years = avg_price_per_year["datesold"]
yearly_average = avg_price_per_year["Average Price"]
plt.plot(years, yearly_average)
plt.title("Average Price per Year")
plt.xlabel("Year")
plt.ylabel("Average Price")
plt.show()

In [None]:
# Number of properties sold in each calendar year: table, then line chart.
sales_by_year = data1.groupby(data1["datesold"].dt.year)
property_count_per_year = (
    sales_by_year.size()
    .reset_index(name="Properties_sold")
    .rename(columns={"datesold": "Year"})
)
display(property_count_per_year)

plt.plot(
    property_count_per_year["Year"],
    property_count_per_year["Properties_sold"],
)
plt.title("Properties Sold per Year")
plt.xlabel("Year")
plt.ylabel("Properties Sold")
plt.show()

In [None]:
# Query for a specific date range. The bounds are hard-coded below to
# 2010-05-01 through 2015-01-01, and both ends are included in the result.
'''
start_date=input("Enter date in format yyyy-mm-dd : ").split("-")
start_date=datetime.datetime(int(start_date[0]),int(start_date[1]),int(start_date[2]))
end_date=input("Enter date in format yyyy-mm-dd : ").split("-")
end_date=datetime.datetime(int(end_date[0]),int(end_date[1]),int(end_date[2]))
'''
start_date = datetime.datetime(year=2010, month=5, day=1)
end_date = datetime.datetime(year=2015, month=1, day=1)

on_or_after_start = data1["datesold"] >= start_date
on_or_before_end = data1["datesold"] <= end_date
display(data1[on_or_after_start & on_or_before_end])

In [None]:
# Mean sale price per calendar month (1-12, pooled across all years),
# computed with groupby and shown as a table and a line chart.
sale_month = data1["datesold"].dt.month
mean_price_by_month = (
    data1.groupby(sale_month)["price"]
    .mean()
    .reset_index(name="Average per by month")
    .rename(columns={"datesold": "Month"})
)
display(mean_price_by_month)

plt.plot(
    mean_price_by_month["Month"],
    mean_price_by_month["Average per by month"],
)
plt.title("Average Price per Month")
plt.xlabel("Month")
plt.ylabel("Average Price")
plt.show()

In [None]:
# Histogram of sale prices, split into 20 bins.
sale_prices = data1["price"]
plt.hist(sale_prices, bins=20)
plt.title("Histogram of price")
plt.xlabel("price")
plt.ylabel("Frequency")
plt.show()

In [None]:
# Show the properties priced above 5 lakh (500,000).
above_five_lakh = data1["price"] > 500000
display(data1[above_five_lakh])

# ***EX_2***

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the shampoo sales data and preview its first rows.
data2 = pd.read_csv("data/shampoo_sales.csv")
display(data2.head())

In [None]:
# Basic exploratory data analysis; incomplete and repeated rows are dropped.

# Frame overview printed by DataFrame.info().
data2.info()

display("Summary Statistics : ")
display(data2.describe())

missing_per_column = data2.isnull().sum().reset_index(name=" No of Missing Values")
display("No of Missing Values :", missing_per_column)

# Remove rows with missing data, if any exist.
data2 = data2.dropna()

duplicate_rows = data2.duplicated().sum()
print(f"\n\nNo of Duplicates in Dataset : {duplicate_rows}\n\n")

# Remove duplicated rows, if any exist.
data2 = data2.drop_duplicates()

display("First Five Rows : ")
display(data2.head())
display("Last Five Rows : ")
display(data2.tail())

In [None]:
# Date-based features: parse the raw "Month" strings into a "Date" column, then
# overwrite "Month" with the month number and add a "Year" column.
# NOTE(review): the format reads the first field as the month and the second as
# a two-digit year - confirm this matches how the CSV encodes "Month".
parsed_dates = pd.to_datetime(data2["Month"], format="%m-%y")
data2["Date"] = parsed_dates
data2["Month"] = parsed_dates.dt.month
data2["Year"] = parsed_dates.dt.year

display(data2.head())
display(data2.tail())

In [None]:
# Histogram of sales (20 bins) drawn without any of the four axes borders.
ax = data2["Sales"].plot(kind="hist", bins=20, title="Histogram of Sales")
for side in ("top", "bottom", "right", "left"):
    ax.spines[side].set_visible(False)
plt.xlabel("Sales")
plt.show()

In [None]:
# Line chart of the sales values in row order.
# Series.plot draws the values on the y-axis against the frame's index on the
# x-axis, so the axes are labelled accordingly (the x-axis was previously
# labelled "Sales", which is the quantity on the y-axis).
data2["Sales"].plot(kind='line', title="sales")
plt.xlabel("Row index")
plt.ylabel("Sales")
plt.show()

In [None]:
# Lag features: lag_k holds the Sales value from k rows earlier, so the first
# k rows of each lag column are NaN.
data2["lag_1"] = data2["Sales"].shift(1)
# Shift by 2 to agree with the column name; this was shift(3), which stored a
# three-row lag under the name "lag_2".
data2["lag_2"] = data2["Sales"].shift(2)
display(data2.head())

In [None]:
# Rolling statistics of Sales over a window of 5 rows. The first four rows of
# each column are NaN because the window is not yet full.
sales_window_5 = data2["Sales"].rolling(5)
data2["rolling_mean_5"] = sales_window_5.mean()
data2["rolling_min_5"] = sales_window_5.min()
data2["rolling_max_5"] = sales_window_5.max()
data2["rolling_std_5"] = sales_window_5.std()

display(data2.head(10))

In [None]:
# Expanding (cumulative) statistics of Sales: each row summarises every value
# from the first row up to and including the current one.
sales_so_far = data2["Sales"].expanding()
data2["expanding_mean"] = sales_so_far.mean()
data2["expanding_min"] = sales_so_far.min()
data2["expanding_max"] = sales_so_far.max()
data2["expanding_std"] = sales_so_far.std()

display(data2.head(10))

In [None]:
# Drop, in place, every row that still contains a missing value.
data2.dropna(axis=0, how="any", inplace=True)
display(data2.head())

In [None]:
# Correlation matrix of the standardised numeric columns, for feature extraction.
from sklearn.preprocessing import StandardScaler

Scaler = StandardScaler()
data_numeric = data2.select_dtypes(include=["number"])

# Rebuild a labelled frame from the scaler output, reusing the original
# index and column names.
standardised_values = Scaler.fit_transform(data_numeric)
Scaled_data = pd.DataFrame(
    standardised_values,
    index=data_numeric.index,
    columns=data_numeric.columns,
)
display(Scaled_data.head())

correlations = Scaled_data.corr()
sns.heatmap(correlations, annot=True, cmap="coolwarm")
plt.title("Correlation Matrix")
plt.show()

# ***EX_3***

In [None]:
from statsmodels.graphics.tsaplots import plot_acf,plot_pacf
from statsmodels.tsa.arima.model import ARIMA

In [None]:
# Load the daily minimum temperature data and preview its first rows.
data3 = pd.read_csv("data/daily-min-temperatures.csv")
display(data3.head())

In [None]:
# Find how the time series values differ over time by analysing the key aspects
# of temporal relationships such as trends, seasonality, lags and cycles.

# Line plot against the row index.
data3["Temp"].plot(title="Daily Temperature")
plt.show()

# Scatter against the calendar date. The file is read without date parsing, so
# "Date" is converted here; matplotlib would otherwise treat each date string
# as a separate category instead of a point on a time axis.
# NOTE(review): assumes the "Date" strings are in a format pd.to_datetime can
# infer - confirm against the CSV.
plt.scatter(pd.to_datetime(data3["Date"]), data3["Temp"])
plt.title("Daily Temperature")
plt.show()

In [None]:
# Autocorrelation and partial autocorrelation of the temperatures, each drawn
# for 50 lags and shown as its own figure.
for draw_correlogram in (plot_acf, plot_pacf):
    draw_correlogram(data3["Temp"], lags=50)
    plt.show()

In [None]:
# Fit three ARIMA(p, d, q) variants to the temperature series and show each summary.
temperature_series = data3["Temp"]

# order=(2, 0, 0)
ar_model = ARIMA(temperature_series, order=(2, 0, 0))
ar_fit = ar_model.fit()
display('AR Model : ')
display(ar_fit.summary())

# order=(0, 0, 2)
ma_model = ARIMA(temperature_series, order=(0, 0, 2))
ma_fit = ma_model.fit()
display('MA Model : ')
display(ma_fit.summary())

# order=(2, 0, 2)
arma_model = ARIMA(temperature_series, order=(2, 0, 2))
arma_fit = arma_model.fit()
display('ARMA Model : ')
display(arma_fit.summary())

In [None]:
# Four views of the temperature column: histogram, density, box plot, and a
# correlation heatmap including lagged and rolling features.
temperature = data3["Temp"]

# Histogram with 20 bins and outlined bars.
temperature.plot(
    kind="hist",
    bins=20,
    title="Histogram of Temperature",
    edgecolor="black",
)
plt.xlabel("Temperature")
plt.show()

# Filled kernel density estimate.
sns.kdeplot(temperature, fill=True)
plt.title("Density PLot of Temperature")
plt.show()

# Box-and-whisker plot.
sns.boxplot(temperature)
plt.title("Box and Whisker Plot of Temperature")
plt.show()

# Heatmap: add 1- and 3-row lags plus a 3-row rolling mean to data3, then
# correlate all numeric columns.
data3["Lagged_1"] = temperature.shift(1)
data3["Lagged_3"] = temperature.shift(3)
data3["Rolling_mean_3"] = temperature.rolling(3).mean()
numeric_columns = data3.select_dtypes(include=['number'])
sns.heatmap(numeric_columns.corr(), annot=True, cmap='coolwarm', linewidths=0.5)
plt.show()

## ***Extra***

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

# Additive decomposition of the temperatures with period=365.
data_decomposed = seasonal_decompose(data3["Temp"], model="additive", period=365)

trend = data_decomposed.trend
seasonal = data_decomposed.seasonal
residual = data_decomposed.resid

# Draw the original series and each component as its own figure.
for component, heading in (
    (data3["Temp"], 'Original'),
    (trend, "Trend"),
    (seasonal, "Seasonal"),
    (residual, "Residual"),
):
    plt.plot(component)
    plt.title(heading)
    plt.show()

# ***EX_4***

In [None]:
# Load the shampoo sales data again and turn it into a date-indexed Series.
data4 = pd.read_csv("data/shampoo_sales.csv")
display(data4.head())

# Parse the "Month" strings into a "Date" column, then drop the raw strings.
data4["Date"] = pd.to_datetime(data4["Month"], format="%m-%y")
data4 = data4.drop(columns="Month")
display(data4.head())

# Sales values indexed by their parsed dates.
sales_values = data4["Sales"].values
data = pd.Series(sales_values, index=data4["Date"])
display(data.head())

In [None]:
# Upsampling: resample the series to hourly / 1-minute / 5-minute frequencies,
# each with a different way of filling the new slots.

up_1 = data.resample("h").mean()
up_2 = data.resample("1min").min()
up_3 = data.resample('5min').sum()
up_4 = data.resample('h').asfreq()

# For each result show the series, its summary, and its count of missing values.
for upsampled in (up_1, up_2, up_3, up_4):
    display(
        upsampled,
        upsampled.describe(),
        "No of missing Values : ",
        upsampled.isnull().sum(),
    )

In [None]:
# Downsampling: bring each upsampled series back to a coarser frequency using
# the same aggregation that produced it.

down_1 = up_1.resample("D").mean()
down_2 = up_2.resample("10min").min()
down_3 = up_3.resample("h").sum()
down_4 = up_4.resample("D").asfreq()

# For each result show the series, its summary, and its count of missing values.
for downsampled in (down_1, down_2, down_3, down_4):
    display(
        downsampled,
        downsampled.describe(),
        "No of missing Values : ",
        downsampled.isnull().sum(),
    )

In [None]:
# Interpolation: resample to daily frequency and fill the gaps two ways.

d1 = data.resample("D")

# Linear interpolation between the known points.
interpolated_1 = d1.interpolate(method="linear")
display(interpolated_1.head())
display(interpolated_1.tail())
interpolated_1.plot(title="Linear Interpolation", kind="line")
plt.show()

# Spline interpolation with order=2.
interpolated_2 = d1.interpolate(method="spline", order=2)
display(interpolated_2.head())
display(interpolated_2.tail())
interpolated_2.plot(title="Spline Interpolation")
plt.show()

# ***EX_5***

In [None]:
from statsmodels.tsa.stattools import kpss,adfuller
from statsmodels.tsa.seasonal import seasonal_decompose

In [None]:
# Load the daily female births data and preview its first rows.
data5 = pd.read_csv("data/daily-total-female-births.csv")
display(data5.head())

In [None]:
import warnings
warnings.filterwarnings('ignore')

class stationary_test():
    """Print ADF and KPSS stationarity test reports for a series.

    The two tests have opposite null hypotheses, so a small p-value means
    opposite things: the ADF null is a unit root (non-stationary), whereas the
    KPSS null is that the series is stationary.
    """

    @staticmethod
    def _verdict(p_value, reject_means_stationary, alpha=0.05):
        """Return the "Result" line for a test at significance level *alpha*.

        Args:
            p_value: The test's p-value.
            reject_means_stationary: True when rejecting the null hypothesis
                indicates stationarity (ADF), False when it indicates
                non-stationarity (KPSS).
            alpha: Significance threshold; the null is rejected below it.
        """
        rejected = p_value < alpha
        stationary = rejected if reject_means_stationary else not rejected
        return f"Result : The data is {'' if stationary else 'not '}Stationary."

    @staticmethod
    def _print_critical_values(critical_values):
        """Print each critical-value entry on its own indented line."""
        print("Critical Values : ")
        for key, value in critical_values.items():
            print(f"  {key} : {value}")

    def adf_test(self, data):
        """Run the Augmented Dickey-Fuller test on *data* and print a report."""
        print("\nAdfuller : \n")
        statistic, p_value, n_lags, n_obs, critical_values, m_info = adfuller(data)
        print(f"Statistic : {statistic}")
        print(f"P_value : {p_value}")
        print(f"n_lags : {n_lags}")
        print(f"n_obs : {n_obs}")
        print(f"max_info : {m_info}")
        self._print_critical_values(critical_values)
        # A small p-value rejects the unit-root null, i.e. the data IS
        # stationary (this verdict was previously inverted).
        print(self._verdict(p_value, reject_means_stationary=True))

    def kpss_test(self, data):
        """Run the KPSS test on *data* and print a report."""
        print("\nKPSS : \n")
        statistic, p_value, n_lags, critical_values = kpss(data)
        print(f"Statistic : {statistic}")
        print(f"P_value : {p_value}")
        print(f"n_lags : {n_lags}")
        self._print_critical_values(critical_values)
        # A small p-value rejects the stationarity null, i.e. NOT stationary.
        print(self._verdict(p_value, reject_means_stationary=False))

# Run both stationarity checks on the "Births" column.
births = data5["Births"]
stationarity_checker = stationary_test()
stationarity_checker.adf_test(births)
stationarity_checker.kpss_test(births)

In [None]:
def decompose_data(data, column="Births", period=7, model="additive"):
    """Decompose one column of *data* and plot the result as four stacked panels.

    The panels are, top to bottom: the original series, the trend, the
    seasonal component, and the residual.

    Args:
        data: Table holding the series; ``data[column]`` is decomposed.
        column: Name of the column to decompose; also used as the legend
            label of every panel. Defaults to ``"Births"``.
        period: Value passed to ``seasonal_decompose`` as ``period``.
            Defaults to 7.
        model: Value passed to ``seasonal_decompose`` as ``model``.
            Defaults to ``"additive"``.
    """
    series = data[column]
    decomposed_data = seasonal_decompose(series, model=model, period=period)

    panels = (
        ("Original", series),
        ("Trend", decomposed_data.trend),
        ("Seasonal", decomposed_data.seasonal),
        ("Residual", decomposed_data.resid),
    )
    # One row per panel in a single-column grid.
    for position, (heading, component) in enumerate(panels, start=1):
        plt.subplot(len(panels), 1, position)
        plt.plot(component, label=column)
        plt.title(heading)
        plt.legend(loc="best")
    plt.tight_layout()
    plt.show()

# Decompose and plot the births data.
decompose_data(data=data5)

# ***EX_6***

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

In [None]:
# Load the daily minimum temperatures, report the (rows, columns) shape,
# and echo the first rows as the cell output.
df = pd.read_csv("data/daily-min-temperatures.csv")
print(df.shape)
df.head()

In [None]:
# Plot the temperature against the calendar date.
# The x-axis is labelled "Date", so the plot must actually use the dates: a bare
# df.plot() draws against the integer row index instead. The dates are parsed
# on a temporary copy, leaving df itself unchanged.
# NOTE(review): assumes the "Date" strings are in a format pd.to_datetime can
# infer - confirm against the CSV.
df.assign(Date=pd.to_datetime(df["Date"])).plot(
    x="Date",
    y="Temp",
    title="daily Minimum Temperature",
    figsize=(14, 8),
    legend=False,
    color="green",
)
plt.xlabel('Date')
plt.ylabel('Temperature (°C)')
plt.show()

In [None]:
# ACF (top) and PACF (bottom) of the temperatures for 30 lags, in one figure.
fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(10, 8))
acf_axis, pacf_axis = axs

plot_acf(
    df["Temp"],
    ax=acf_axis,
    lags=30,
    title="Autocorrelation (ACF)",
    color="green",
)
plot_pacf(
    df["Temp"],
    ax=pacf_axis,
    lags=30,
    title="Partial Autocorrelation (PACF)",
    color="red",
)

plt.tight_layout()
plt.show()