# Data Exploration Using Non-parametric Methods


## Data Source: https://www.kaggle.com/datasets/camnugent/sandp500



## Data Description:
* Date - in format: yy-mm-dd
* Open - price of the stock at market open (this is NYSE data so all in USD)
* High - Highest price reached in the day
* Low - Lowest price reached in the day
* Close - price of the stock at market close
* Volume - Number of shares traded
* Name - the stock's ticker name

# Importing the Python libraries

In [1]:
import numpy as np
import pandas as pd
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


ModuleNotFoundError: No module named 'scipy'

# Loading the datasets into the runtime environment

In [None]:
# Read the first stock's CSV into df1; the bare name on the last line makes
# the notebook render the frame as the cell output.
aapl_csv_path = "/content/AAPL_data.csv"
df1 = pd.read_csv(aapl_csv_path)
df1

: 

: 

In [None]:
# Read the second stock's CSV into df2; the bare name on the last line makes
# the notebook render the frame as the cell output.
abc_csv_path = "/content/ABC_data.csv"
df2 = pd.read_csv(abc_csv_path)
df2

: 

## Displaying the Columns in dataset 1 and 2

In [None]:
# Show the column labels of each dataset so the two schemas can be compared.
columns_1 = df1.columns
columns_2 = df2.columns
print(f"Columns of data1 is: {columns_1}")
print(f"Columns of data2 is:  {columns_2}")

: 

# Information about both the datasets

In [None]:
# DataFrame.info() writes its report straight to stdout and returns None, so
# embedding the call in an f-string printed the report first and then the
# header followed by "None". Print the header, then let info() emit the report.
print("Displaying infomation about the data1:")
df1.info()

: 

In [None]:
# DataFrame.info() writes its report straight to stdout and returns None, so
# embedding the call in an f-string printed the report first and then the
# header followed by "None". Print the header, then let info() emit the report.
print("Displaying infomation about the data2:")
df2.info()

: 

## Checking for missing values in both data-1 and data-2

In [None]:
# Per-column count of missing values in df1 (isna is the same check as isnull).
df1.isna().sum()

: 

In [None]:
# Per-column count of missing values in df2 (isna is the same check as isnull).
df2.isna().sum()

: 

# Univariate Analysis:
### For numerical variables:

### Calculating basic descriptive statistics

In [None]:
# Summary statistics of df1, computed first and then printed under a header.
summary_df1 = df1.describe()
print(f"Description of data-1 is: \n {summary_df1}")

: 

In [None]:
# Summary statistics of df2, computed first and then printed under a header.
summary_df2 = df2.describe()
print(f"Description of data-2 is: \n {summary_df2}")

: 

### Visualize the distribution using histograms, kernel density plots, or box plots.

In [None]:
# Draw a 2x2 grid of histograms with KDE overlays: one row per dataset,
# 'open' prices in the left column and 'close' prices in the right column.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(14, 12))
plt.subplots_adjust(hspace=0.4)

# Each entry pairs the data with the axes it is drawn on and that panel's
# title, so a title can no longer land on a different panel than its
# histogram. Previously two set_title calls targeted the wrong axes, leaving
# the top-right and bottom-left panels untitled and the top-left panel
# (data-1) labelled as data-2.
panels = [
    (df1['open'], axes[0, 0], 'skyblue', 'Histogram of Open Prices(data-1)'),
    (df1['close'], axes[0, 1], 'orange', 'Histogram of Close Prices(data-1)'),
    (df2['open'], axes[1, 0], 'skyblue', 'Histogram of Open Prices(data-2)'),
    (df2['close'], axes[1, 1], 'orange', 'Histogram of Close Prices(data-2)'),
]

for values, ax, color, title in panels:
    sns.histplot(values, ax=ax, kde=True, bins=20, color=color)
    ax.set_title(title)

plt.show()


: 

### Box plot for Stock 1 and Stock 2

In [None]:
# Put the two 'open' series side by side in one frame; `keys` supplies the
# column labels that appear on the plot's x-axis.
combined_df = pd.concat(
    [df1['open'], df2['open']],
    axis=1,
    keys=['Open Prices - df1', 'Open Prices - df2'],
)

# One box per column of the combined frame.
plt.figure(figsize=(10, 6))
box_ax = sns.boxplot(data=combined_df)
box_ax.set_title('Box Plot of Open Prices')
box_ax.set_ylabel('Open Price')
box_ax.set_xlabel('Dataframe')
plt.show()

: 

* The box plot compares the opening prices of stock 1 and stock 2 over the 5-year period.

## 5. Bivariate Analysis:

* Explore relationships between pairs of numerical variables using scatter plots
or pair plots.

In [None]:
# Relabel the combined frame's two columns in place; combined_df is created in
# the box-plot cell and is modified here.
combined_df.columns = ['Open_prices_df1', 'Open_prices_df2']

# Pair plot of the two open-price columns using the "ticks" seaborn style.
sns.set(style="ticks", color_codes=True)
pair_grid = sns.pairplot(combined_df)

# Leave headroom at the top of the figure for the overall title.
pair_grid.fig.subplots_adjust(top=0.9)
pair_grid.fig.suptitle('Pair Plot of Open Prices from df1 and df2')
plt.show()

: 

## Calculating correlation coefficients between numerical variables.

In [None]:
# Columns whose pairwise correlations are reported for each dataset.
num1 = ['open', 'close', 'volume']
num2 = ['open', 'close', 'volume']

# DataFrame.corr() yields a 3x3 matrix of pairwise coefficients per dataset.
correlation_volume_df1 = df1[num1].corr()
correlation_volume_df2 = df2[num2].corr()

print(
    "Correlation Coefficient for 'Open', 'close' and 'volume' index in df1: \n",
    correlation_volume_df1,
)
print("\n")
print(
    "Correlation Coefficient for 'Open', 'close' and 'volume' index in df2: \n",
    correlation_volume_df2,
)


: 

## Interpretation of Correlation coefficient
* 1.0: Perfect Positive Correlation
* -1.0: Perfect Negative Correlation
* 0: No correlation

## 6. Non-parametric Methods:

### Spearman rank correlation for assessing monotonic relationships between numerical variables.

In [None]:
from scipy.stats import spearmanr

# Spearman rank correlation between opening price and traded volume,
# computed separately for each dataset. The p-values are kept but not printed.
open_1, volume_1 = df1['open'], df1['volume']
open_2, volume_2 = df2['open'], df2['volume']

spearman_corr_df1, p_value_df1 = spearmanr(open_1, volume_1)
spearman_corr_df2, p_value_df2 = spearmanr(open_2, volume_2)

print("Spearman rank correlation coefficient for df1:", spearman_corr_df1)
print("Spearman rank correlation coefficient for df2:", spearman_corr_df2)


: 

## Mann-Whitney U test

### Hypotheses:
* Null Hypothesis: There is no significant difference between the distributions.
* Alternative Hypothesis: There is a significant difference between the distributions.

In [None]:
# Compare the opening-price distributions of the two stocks as two
# independent samples.
group1, group2 = df1['open'], df2['open']
mann_whitney_test = stats.mannwhitneyu(group1, group2)

u_statistic = mann_whitney_test.statistic
u_p_value = mann_whitney_test.pvalue

print("\nMann-Whitney U test result: ")
print("U statistic: ", u_statistic)
print("p-value: ", u_p_value)

# Decision at the 0.05 significance level.
mann_whitney_verdict = (
    "Mann-Whitney U test: Reject the null hypothesis, significant difference between the distributions."
    if u_p_value < 0.05
    else "Mann-Whitney U test: Fail to reject the null hypothesis, no significant difference between the distributions."
)
print(mann_whitney_verdict)


: 

## Wilcoxon signed-rank test for paired samples.

### Hypotheses:
* Null Hypothesis: There is no significant difference between the paired samples.
* Alternative Hypothesis: There is a significant difference between the paired samples.

In [None]:
from scipy.stats import wilcoxon

# Paired test on the opening prices of the two stocks.
# NOTE(review): assumes both frames have the same number of rows and cover the
# same dates in the same order, so that the pairs are meaningful — confirm.
data1, data2 = df1['open'], df2['open']
statistic, p_value = wilcoxon(data1, data2)

print("Test statistic:", statistic)
print("p-value:", p_value)

# Decision at the 0.05 significance level.
reject_null = p_value < 0.05
if reject_null:
    print("Wilcoxon signed-rank test: Reject the null hypothesis, significant difference between the paired samples.")
else:
    print("Wilcoxon signed-rank test: Fail to reject the null hypothesis, no significant difference between the paired samples.")



: 

## Friedman test for comparing multiple paired samples

In [None]:
# df3=pd.read_csv("/content/ABBV_data.csv")
# df4=pd.read_csv("/content/financials.csv")


: 

## Hypotheses:
* Null hypothesis: there are no statistically significant differences between the medians of the paired samples.
* Alternative hypothesis: there are statistically significant differences between the medians of the paired samples.

In [None]:
from scipy.stats import friedmanchisquare

# A third stock is needed because the test below is run on three samples.
df3 = pd.read_csv("/content/ABBV_data.csv")
indexes = ['open', 'close', 'volume']

# Run one Friedman test per column, with the three stocks as the paired
# samples. Previously every column of every stock was packed into nested lists
# (grouped by column, not by stock) and passed to a single call, so prices and
# volumes were ranked against each other and the arguments were not the three
# paired samples being compared.
# NOTE(review): assumes the three frames have the same number of rows and
# cover the same dates in the same order — confirm.
for index in indexes:
    data = [df1[index], df2[index], df3[index]]
    statistic, p_value = friedmanchisquare(*data)

    print(f"\nFriedman test for '{index}':")
    print("Friedman test statistic:", statistic)
    print("p-value:", p_value)

    # Decision at the 0.05 significance level.
    if p_value < 0.05:
        print("Friedman test: Reject the null hypothesis, significant difference between the medians of the paired samples.")
    else:
        print("Friedman test: Fail to reject the null hypothesis, no significant difference between the median of the paired samples.")


: 

## Visualization

## Line Chart

In [None]:
# Keep just the 'open' column of each dataset for plotting.
open_prices_df1 = df1['open']
open_prices_df2 = df2['open']

plt.figure(figsize=(10, 6))

# Draw one line per stock on the same axes.
# NOTE(review): the x values are each frame's index, not a date column, even
# though the axis is labelled 'Date' — confirm this is intended.
labelled_series = (
    (open_prices_df1, 'Open Prices - Stock-1'),
    (open_prices_df2, 'Open Prices - Stock-2'),
)
for open_series, line_label in labelled_series:
    plt.plot(open_series.index, open_series.values, label=line_label)

# Axis labels, title and legend.
plt.xlabel('Date')
plt.ylabel('Open Price')
plt.title('Open Prices Comparison')
plt.legend()

plt.show()

: 

## Histogram

In [None]:
# Distribution of df1's 'volume' column: 30-bin histogram with a KDE overlay.
volume_fig_1, volume_ax_1 = plt.subplots(figsize=(8, 6))
sns.histplot(df1['volume'], bins=30, kde=True, ax=volume_ax_1)
volume_ax_1.set_title('Volume Distribution')
volume_ax_1.set_xlabel('Volume')
volume_ax_1.set_ylabel('Frequency')
plt.show()

: 

In [None]:
# Distribution of df2's 'volume' column: 30-bin histogram with a KDE overlay.
volume_fig_2, volume_ax_2 = plt.subplots(figsize=(8, 6))
sns.histplot(df2['volume'], bins=30, kde=True, ax=volume_ax_2)
volume_ax_2.set_title('Volume Distribution')
volume_ax_2.set_xlabel('Volume')
volume_ax_2.set_ylabel('Frequency')
plt.show()

: 