# Insights into the Nifty 50 Index

In [None]:
#Loading file data
import pandas as pd
import numpy as np
# Read the NIFTY 50 history from the Kaggle input directory and preview the first rows
nifty50_csv_path = "/kaggle/input/nifty-indices-dataset/NIFTY 50.csv"
df_data = pd.read_csv(nifty50_csv_path)
df_data.head()


In [None]:
# Show the column labels of the loaded frame
df_data.columns

In [None]:
# Data type pandas assigned to each column (feature) of df_data
df_data.dtypes

In [None]:
# Boolean mask with the same shape as df_data: True marks a missing cell
missing_data = df_data.isna()
missing_data.head()


In [None]:
# Per-column tally of missing (True) versus present (False) entries
for col_name in missing_data.columns:
    print(col_name)
    print(missing_data[col_name].value_counts())
    print("")

`True` means a value is missing and `False` means it is present.
So data is missing in the Volume and Turnover columns.
We will deal with the missing data by filling each gap with the average value of its column.

In [None]:
# Mean of the Volume column, cast to float first; kept for filling the gaps later
volume_as_float = df_data["Volume"].astype(float)
avg_vol = volume_as_float.mean()
print("Average Volume:", avg_vol)

In [None]:
# Mean of the Turnover column, cast to float first; kept for filling the gaps later
turnover_as_float = df_data["Turnover"].astype(float)
avg_tur = turnover_as_float.mean()
print("Average Turnover:", avg_tur)

Replace the missing values in each column with that column's mean.

In [None]:
# Fill the missing (NaN) entries of the Volume column with the column average.
# The result is assigned back to the frame rather than calling an in-place
# method on the `df_data["Volume"]` selection: with pandas Copy-on-Write that
# selection is a temporary object, so an in-place call on it would leave
# `df_data` itself unchanged.
df_data["Volume"] = df_data["Volume"].fillna(avg_vol)

In [None]:
# Fill the missing (NaN) entries of the Turnover column with the column average.
# The result is assigned back to the frame rather than calling an in-place
# method on the `df_data["Turnover"]` selection: with pandas Copy-on-Write that
# selection is a temporary object, so an in-place call on it would leave
# `df_data` itself unchanged.
df_data["Turnover"] = df_data["Turnover"].fillna(avg_tur)

In [None]:
# Rebuild the missing-value mask from the current state of df_data
missing_data = df_data.isna()
missing_data.head()

In [None]:
# Repeat the per-column tally of missing (True) versus present (False) entries
for col_name in missing_data.columns:
    print(col_name)
    print(missing_data[col_name].value_counts())
    print("")

# So there are no missing values now

In [None]:
# Turn the Date column from object (string) values into pandas timestamps,
# then display the dtypes to confirm the conversion
parsed_dates = pd.to_datetime(df_data["Date"])
df_data["Date"] = parsed_dates
df_data.dtypes

Add a `month` column for month-wise analysis and a `year` column for year-wise analysis.

In [None]:
# Derive calendar month and year from the parsed Date column
date_parts = df_data["Date"].dt
df_data["month"] = date_parts.month
df_data["year"] = date_parts.year

In [None]:
# Preview the first rows, now including the derived month and year columns
df_data.head()

In [None]:
# Print a concise summary of the frame (columns, non-null counts, dtypes)
df_data.info()

In [None]:
# Descriptive statistics for the columns of df_data
df_data.describe()

Now let's view the correlation between all the columns.

In [None]:
# Pairwise correlation between the numeric columns only.
# `Date` is a datetime column at this point, and since pandas 2.0
# `DataFrame.corr` no longer drops non-numeric columns by default
# (`numeric_only=False`), so the numeric columns are selected explicitly to get
# the same matrix on every pandas version.
df_data.select_dtypes(include="number").corr()

Now let's start visualizing all the data

In [None]:
# Importing all the Vizualization libraries
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline
import matplotlib as plt
from matplotlib import pyplot
import matplotlib.pyplot as plt


In [None]:
# Correlation matrix restricted to Volume and Turnover
df_data.loc[:, ["Volume", "Turnover"]].corr()

We can see a positive correlation between Volume and Turnover,
so let's visualize it with a regression plot.

In [None]:
# Scatter of Turnover against Volume with a fitted regression line (green);
# truncate=False extends the line to the axis limits, and the y-axis starts at 0
sns.regplot(data=df_data, x="Volume", y="Turnover", color="green", truncate=False)
plt.ylim(bottom=0)

In [None]:
# Correlation matrix restricted to Volume and Div Yield
df_data.loc[:, ["Volume", "Div Yield"]].corr()

We can see a negative correlation between Volume and Div Yield.

In [None]:
# Scatter of Div Yield against Volume with a fitted regression line (red);
# truncate=False extends the line to the axis limits, and the y-axis starts at 0
sns.regplot(data=df_data, x="Volume", y="Div Yield", color="r", truncate=False)
plt.ylim(bottom=0)

In [None]:
# Correlation matrix restricted to Turnover and Div Yield
df_data.loc[:, ["Turnover", "Div Yield"]].corr()

The correlation between Turnover and Div Yield is also negative.

In [None]:
# Scatter of Div Yield against Turnover with a fitted regression line;
# truncate=False extends the line to the axis limits, and the y-axis starts at 0
sns.regplot(data=df_data, x="Turnover", y="Div Yield", truncate=False)
plt.ylim(bottom=0)

Now let's see how the data varies by month.

In [None]:
# Line plot of Volume against calendar month
sns.relplot(data=df_data, x="month", y="Volume", kind="line")

The above plot shows that volume rises roughly linearly at the start of the year, falls mid-year, and then climbs again.

In [None]:
# Line plot of Turnover against calendar month
sns.relplot(data=df_data, x="month", y="Turnover", kind="line")

The above plot shows how turnover varies over the course of the year.

Now let's visualize the data over the years 2000-2020.

In [None]:
# Bar chart of Volume per year on the dark-grid theme, drawn on a wide figure
sns.set_style("darkgrid")
plt.figure(figsize=(15, 6))
sns.barplot(data=df_data, x="year", y="Volume")

In [None]:
# Bar chart of Turnover per year; ci="sd" draws standard-deviation error bars.
# NOTE(review): recent seaborn releases deprecate `ci` in favour of
# `errorbar="sd"` — confirm the installed version before switching.
# `df` holds the object returned by seaborn (the plot axes), not a DataFrame.
plt.figure(figsize=(15, 6))
df = sns.barplot(data=df_data, x="year", y="Turnover", ci="sd")

In [None]:
# Point plot of Div Yield per year on an extra-wide figure
plt.figure(figsize=(18, 5))
sns.pointplot(data=df_data, x="year", y="Div Yield")

In [None]:
# Yearly distribution of the P/E ratio as box plots.
# `df` holds the object returned by seaborn (the plot axes), not a DataFrame.
plt.figure(figsize=(15, 6))
df = sns.boxplot(data=df_data, x="year", y="P/E")

In [None]:
# Yearly distribution of the P/B ratio as box plots.
# `df` holds the object returned by seaborn (the plot axes), not a DataFrame.
plt.figure(figsize=(15, 6))
df = sns.boxplot(data=df_data, x="year", y="P/B")

Let's visualize the Open, High, Low, Close (OHLC) data.

In [None]:
# Keep only the date and the four price columns for the OHLC analysis
ohlc_columns = ["Date", "Open", "High", "Low", "Close"]
df_ohlc = df_data[ohlc_columns]
df_ohlc.head()

In [None]:
import seaborn as sns

# Switch to the "ticks" theme, then draw the lower-triangle (corner) grid of
# pairwise plots for the OHLC frame.
# `df` holds the grid object returned by seaborn, not a DataFrame.
sns.set(style="ticks", color_codes=True)
df = sns.pairplot(df_ohlc, corner=True)

In [None]:
# Pairwise correlation between the four price columns.
# `df_ohlc` also carries the datetime `Date` column, and since pandas 2.0
# `DataFrame.corr` no longer drops non-numeric columns by default
# (`numeric_only=False`), so the numeric columns are selected explicitly to get
# the same matrix on every pandas version.
df_ohlc.select_dtypes(include="number").corr()

This shows that there is a very close, linear relationship between all four features (OHLC).

In [None]:
# Hex-binned joint distribution of High against Low with marginal plots.
# `df` holds the grid object returned by seaborn, not a DataFrame.
df = sns.jointplot(data=df_data, x="High", y="Low", kind="hex", height=5, ratio=3)

Since all of them show a close, linear correlation, we will plot only the High price over the years (2000-2020).

In [None]:
# Point plot of the High price per year on the dark-grid theme, extra-wide figure
sns.set_style("darkgrid")
plt.figure(figsize=(18, 5))
sns.pointplot(data=df_data, x="year", y="High")