## Statistics For Data Science
[Link](https://www.geeksforgeeks.org/data-science/statistics-for-data-science/)

In [0]:
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import pandas as pd
np.random.seed(42)  # fixed seed so every run draws the same samples

# 1. Draw two samples of 100 integers each (randint's upper bound is exclusive)
arr1 = np.random.randint(0, 11, size=100)  # values 0..10
arr2 = np.random.randint(0, 6, size=100)   # values 0..5

# 2. Pool both samples into a single 200-element array
#    (concatenation: arr1 followed by arr2, NOT element-wise addition)
combined_arr = np.concatenate((arr1, arr2))

# 3. Descriptive statistics of the pooled sample
# Central tendency
mean = combined_arr.mean()
median = np.median(combined_arr)
mode = stats.mode(combined_arr, keepdims=True).mode[0]  # most frequent value

# Quartiles in one call; q2 (50th percentile) equals the median
q1, q2, q3 = np.percentile(combined_arr, [25, 50, 75])
iqr = q3 - q1                                            # spread of the middle 50%
data_range = combined_arr.max() - combined_arr.min()     # max - min

# Dispersion (NumPy defaults: population formulas, ddof=0)
std_dev = combined_arr.std()
variance = combined_arr.var()

# 4. Report
print(f"Mean: {mean:.2f}, Median: {median}, Mode: {mode}")
print(f"Q1: {q1}, Q2: {q2}, Q3: {q3}, IQR: {iqr}, Range: {data_range}")
print(f"Std Dev: {std_dev:.2f}, Variance: {variance:.2f}")

# 5. Visualizations: histogram and box plot of combined_arr, side by side
plt.figure(figsize=(12, 5))

# Subplot 1: Histogram of combined_arr (values 0 to 10)
plt.subplot(1, 2, 1)
# Bin edges are np.arange(0, 12) - 0.5, i.e. -0.5, 0.5, ..., 10.5:
# 11 unit-width bins, each centred on one integer value 0..10.
plt.hist(combined_arr, bins=np.arange(0, 12) - 0.5, color='skyblue', edgecolor='black', alpha=0.7)
plt.title("Distribution of combined_arr (0 to 10)")
plt.xlabel("Value")
plt.ylabel("Frequency")
plt.xticks(range(0, 11))  # one tick per integer value

# Subplot 2: Box plot of combined_arr
plt.subplot(1, 2, 2)
plt.boxplot(combined_arr)
plt.title("Box Plot of Combined Data")
plt.ylabel("Values")

plt.tight_layout()
plt.show()


In [0]:
# Side-by-side histograms of the two source samples.
# Bin edges sit at k - 0.5 so that every integer value gets its own
# unit-width bin centred on that value.

# Subplot 1: Histogram of arr1 (values 0 to 10)
plt.subplot(1, 2, 1)
# Edges -0.5, 0.5, ..., 10.5 -> 11 bins, one per integer 0..10.
plt.hist(arr1, bins=np.arange(0, 12) - 0.5, color='skyblue', edgecolor='black', alpha=0.7)
plt.title("Distribution of arr1 (0 to 10)")
plt.xlabel("Value")
plt.ylabel("Frequency")
plt.xticks(range(0, 11))

# Subplot 2: Histogram of arr2 (values 0 to 5)
plt.subplot(1, 2, 2)
# Edges -0.5, 0.5, ..., 5.5 -> 6 bins, one per integer 0..5.
# The arange stop must be 7: with 6 the last edge is 4.5, so every value 5
# lies outside the bins and is silently left out of the histogram.
plt.hist(arr2, bins=np.arange(0, 7) - 0.5, color='lightgreen', edgecolor='black', alpha=0.7)
plt.title("Distribution of arr2 (0 to 5)")
plt.xlabel("Value")
plt.ylabel("Frequency")
plt.xticks(range(0, 6))

plt.tight_layout()
plt.show()

In [0]:
# Frequency table of combined_arr with running totals.

# 1. How often each distinct value occurs, ordered by the value itself
counts_series = pd.Series(combined_arr).value_counts().sort_index()

# 2. One row per distinct value; the frequency column gets its final name up front
df_summary = pd.DataFrame({
    'Value': counts_series.index,
    'Counts (Frequency)': counts_series.values,
})

# 3. Contribution of each value to the overall sum (value * frequency),
#    followed by its running total
df_summary['Total Value Occurred'] = (
    df_summary['Value'] * df_summary['Counts (Frequency)']
)
df_summary['Cumulative Sum of Total Value'] = (
    df_summary['Total Value Occurred'].cumsum()
)

# 4. Running total of the frequencies
df_summary['Cumulative Sum of Counts'] = df_summary['Counts (Frequency)'].cumsum()

# 5. Last expression of the cell: the notebook renders the table
df_summary

![image_1769444943966.png](./image_1769444943966.png "image_1769444943966.png")

In [0]:
# Given a series, compute Q1, the median (Q2), Q3, the IQR, the lower and
# upper fences (lf, uf) and the outliers beyond them, and draw a box plot.
def get_range_iqr_outliers(series):
    """Compute quartiles, IQR, 1.5*IQR fences and outliers, and show a box plot.

    Despite the function name, the data range and the quartile positions are
    neither computed nor returned; the return tuple is kept unchanged so that
    existing callers keep working.

    Args:
        series: One-dimensional numeric data that supports element-wise
            comparison and boolean-mask indexing (the caller in this file
            passes a pandas Series).

    Returns:
        A 7-tuple ``(q1, q2, q3, iqr, lf, uf, outliers)`` where ``q1``,
        ``q2`` and ``q3`` are the 25th, 50th and 75th percentiles as given by
        ``np.percentile``, ``iqr`` is ``q3 - q1``, ``lf`` / ``uf`` are the
        lower / upper fences ``q1 - 1.5 * iqr`` / ``q3 + 1.5 * iqr``, and
        ``outliers`` is the subset of ``series`` strictly below ``lf`` or
        strictly above ``uf``.

    Side effects:
        Draws a box plot of ``series`` and calls ``plt.show()``.
    """
    q1, q2, q3 = np.percentile(series, [25, 50, 75])
    iqr = q3 - q1

    # Values beyond 1.5 * IQR from the quartiles are treated as outliers.
    lf = q1 - 1.5 * iqr
    uf = q3 + 1.5 * iqr
    outliers = series[(series < lf) | (series > uf)]

    # Box plot of the same data
    plt.boxplot(series)
    plt.title("Box Plot")
    plt.ylabel("Values")
    plt.show()

    return q1, q2, q3, iqr, lf, uf, outliers

  

In [0]:
# Small sample with one extreme value: the quartiles are 4, 6 and 8, so the
# fences are -2 and 14 and the value 50 is reported as the only outlier.
series_1 = pd.Series([2,2,4,4,6,6,8,10,50])
# Last expression of the cell: shows the box plot and the returned tuple
# (q1, q2, q3, iqr, lf, uf, outliers).
get_range_iqr_outliers(series_1)