Some typical NumPy 'random' functions:

In [None]:
import numpy as np

# Seed the global generator BEFORE drawing anything, so that every value
# printed below is identical on each run. np.random.seed() returns None, so it
# is called for its side effect only and is not printed. The seed value 42 is
# arbitrary; any other integer works the same way.
np.random.seed(42)

# Uniform random floats in the half-open interval [0, 1): 0 can occur, 1 cannot
print(np.random.rand(3, 2))  # 3 x 2 matrix of uniform random floats

# Random floats from the standard normal distribution (mean = 0, std = 1)
print(np.random.randn(5))  # 5 values

# Random integers in a range: the lower bound is inclusive, the upper bound exclusive
print(np.random.randint(1, 100, 10))  # 10 integers from 1 to 99

# Randomly select elements from a given list of elements
# (drawn with replacement by default, so a value may repeat)
print(np.random.choice([1, 2, 3, 4, 5, 6, 7, 8, 9], size=3))  # 3 values

# Random sample from a normal distribution with a chosen mean and standard deviation
# loc is the mean, scale is the standard deviation
print(np.random.normal(loc=0, scale=1, size=10))  # 10 values

AIM #1: Generate a very large dataset
1. Generate a dataset of 1 million random data items between 1 and 100 using only pandas
2. Generate a dataset of 1 million random data items between 1 and 100 using only NumPy
3. Calculate the time it takes for both the above operations. 
    3.1. Import the 'time' module, and use the time() function to calculate current time
    3.2. Which one is faster and why?

In [None]:
# Write your code for AIM #1 here

import time

import numpy as np
import pandas as pd

# Number of random items to generate with each library
N_ITEMS = 1_000_000

# ---------------------------------------------------------------------------
# Using pandas only
# ---------------------------------------------------------------------------
# time.perf_counter() is a monotonic, high-resolution clock meant for measuring
# short durations; time.time() is a wall clock that can jump if the system
# clock is adjusted.
start_time_pandas = time.perf_counter()

# Sample 1 million values (with replacement) from the integers 1..100 using
# the pandas API only, then renumber the index 0..N-1.
pandas_data = (
    pd.Series(range(1, 101))
    .sample(n=N_ITEMS, replace=True)
    .reset_index(drop=True)
)

end_time_pandas = time.perf_counter()
pandas_time = end_time_pandas - start_time_pandas
print(f"Pandas time: {pandas_time:.6f} seconds")

# ---------------------------------------------------------------------------
# Using NumPy only
# ---------------------------------------------------------------------------
start_time_numpy = time.perf_counter()

# randint's upper bound is exclusive, so 101 yields integers from 1 to 100
numpy_data = np.random.randint(1, 101, size=N_ITEMS)

end_time_numpy = time.perf_counter()
numpy_time = end_time_numpy - start_time_numpy
print(f"NumPy time: {numpy_time:.6f} seconds")

# Compare the times
if pandas_time < numpy_time:
    print(f"Pandas is faster by {numpy_time - pandas_time:.6f} seconds")
else:
    print(f"NumPy is faster by {pandas_time - numpy_time:.6f} seconds")

# NumPy is generally faster here. It fills one flat integer array directly,
# whereas pandas has to build a Series, pick positions to sample, gather the
# values, and then rebuild the index on top of the same kind of array.
# A single run is noisy; repeat the measurement (e.g. with timeit) before
# drawing firm conclusions.



AIM #2: Basic statistics
For the given dataset on sleep health and lifestyle, do the following
1. Using only pandas, load the dataset, calculate mean 'Sleep Duration', 'Systolic Blood Pressure', 'Diastolic Blood Pressure', 'Heart Rate' and 'Daily Steps'.
2. Do the same as in Step 1 using only NumPy
3. Using only pandas, first calculate correlation (across only the numerical variables), and then separate correlation between...
    Sleep duration and Age
    Sleep duration and Heart rate
    Sleep duration and Daily steps
4. Using only NumPy, do the same as Step 3
5. Using pandas only, calculate standard deviation for 'Sleep Duration'. 
6. Using NumPy only, calculate standard deviation for 'Sleep Duration'. 
7. Calculate the time difference between using pandas and NumPy, right from the step of loading the dataset to the final standard deviation step. 
    7.1. Which one is faster and why?

In [None]:
# Write your code for AIM #2 here

import time

import numpy as np
import pandas as pd

CSV_PATH = 'sleep_health.csv'

# Columns whose mean is requested, written as they appear in the CSV header
MEAN_COLUMNS = [
    'Sleep Duration',
    'Systolic Blood Pressure',
    'Diastolic Blood Pressure',
    'Heart Rate',
    'Daily Steps',
]

# Columns correlated against each other; 'Sleep Duration' must stay first
# because the NumPy section reads row 0 of the resulting matrix.
CORRELATION_COLUMNS = ['Sleep Duration', 'Age', 'Heart Rate', 'Daily Steps']


def numpy_field(column):
    """Return the np.genfromtxt field name for a CSV column header.

    With names=True, genfromtxt replaces spaces in header names with
    underscores (its default replace_space='_'), so 'Sleep Duration' is
    stored as the field 'Sleep_Duration'.
    """
    return column.replace(' ', '_')


# ---------------------------------------------------------------------------
# Using pandas
# ---------------------------------------------------------------------------
# perf_counter() is a monotonic clock intended for measuring durations
start_time_pandas = time.perf_counter()
df = pd.read_csv(CSV_PATH)

# Calculate means for specified columns
mean_values = df[MEAN_COLUMNS].mean()
print("Pandas Means:\n", mean_values)

# Calculate standard deviation for 'Sleep Duration' (pandas uses ddof=1)
std_sleep_duration_pandas = df['Sleep Duration'].std()
print(f"Pandas Standard Deviation for Sleep Duration: {std_sleep_duration_pandas}")

# Correlation across the numerical variables only. Restricting to numeric
# columns first keeps text columns from making corr() fail.
correlation_matrix = df.select_dtypes(include='number').corr()
sleep_duration_age = correlation_matrix.loc['Sleep Duration', 'Age']
sleep_duration_heart_rate = correlation_matrix.loc['Sleep Duration', 'Heart Rate']
sleep_duration_daily_steps = correlation_matrix.loc['Sleep Duration', 'Daily Steps']

print(f"Correlation between Sleep Duration and Age: {sleep_duration_age}")
print(f"Correlation between Sleep Duration and Heart Rate: {sleep_duration_heart_rate}")
print(f"Correlation between Sleep Duration and Daily Steps: {sleep_duration_daily_steps}")

end_time_pandas = time.perf_counter()
pandas_time = end_time_pandas - start_time_pandas


# ---------------------------------------------------------------------------
# Using NumPy
# ---------------------------------------------------------------------------
start_time_numpy = time.perf_counter()

# names=True already consumes the header row as field names, so skip_header
# must NOT be set as well (that would throw the real header away and use the
# first data row as names). dtype=None lets each column get its own type.
data = np.genfromtxt(CSV_PATH, delimiter=',', names=True, dtype=None, encoding='utf-8')

# Calculate means for specified columns
mean_sleep_duration = np.mean(data[numpy_field('Sleep Duration')])
mean_systolic_bp = np.mean(data[numpy_field('Systolic Blood Pressure')])
mean_diastolic_bp = np.mean(data[numpy_field('Diastolic Blood Pressure')])
mean_heart_rate = np.mean(data[numpy_field('Heart Rate')])
mean_daily_steps = np.mean(data[numpy_field('Daily Steps')])

print("NumPy Means:")
print(f"Sleep Duration: {mean_sleep_duration}")
print(f"Systolic Blood Pressure: {mean_systolic_bp}")
print(f"Diastolic Blood Pressure: {mean_diastolic_bp}")
print(f"Heart Rate: {mean_heart_rate}")
print(f"Daily Steps: {mean_daily_steps}")

# Sample standard deviation (ddof=1) so the result matches pandas' default
std_sleep_duration_numpy = np.std(data[numpy_field('Sleep Duration')], ddof=1)
print(f"NumPy Standard Deviation for Sleep Duration: {std_sleep_duration_numpy}")

# np.corrcoef needs a plain 2-D numeric array with one variable per row.
# Selecting several fields from a structured array does not provide that,
# so each field is converted to float and the fields are stacked as rows.
correlation_input = np.vstack(
    [data[numpy_field(column)].astype(float) for column in CORRELATION_COLUMNS]
)
correlation_matrix_numpy = np.corrcoef(correlation_input)
sleep_duration_age_numpy = correlation_matrix_numpy[0, 1]
sleep_duration_heart_rate_numpy = correlation_matrix_numpy[0, 2]
sleep_duration_daily_steps_numpy = correlation_matrix_numpy[0, 3]

print(f"Correlation between Sleep Duration and Age: {sleep_duration_age_numpy}")
print(f"Correlation between Sleep Duration and Heart Rate: {sleep_duration_heart_rate_numpy}")
print(f"Correlation between Sleep Duration and Daily Steps: {sleep_duration_daily_steps_numpy}")

end_time_numpy = time.perf_counter()
numpy_time = end_time_numpy - start_time_numpy


# Compare the times
print(f"Pandas time: {pandas_time:.6f} seconds")
print(f"NumPy time: {numpy_time:.6f} seconds")

if pandas_time < numpy_time:
    print(f"Pandas is faster by {numpy_time - pandas_time:.6f} seconds")
else:
    print(f"NumPy is faster by {pandas_time - numpy_time:.6f} seconds")

# Which one is faster, and why?
# Trust the numbers printed above rather than a general rule. The statistics
# themselves (mean, std, correlation) are cheap in both libraries, and NumPy
# has slightly less per-call overhead because it works on bare arrays.
# Both timers also include loading the file, though, and that is likely the
# larger cost: genfromtxt is a general-purpose text parser, so the pandas
# pipeline may well come out faster overall despite DataFrame overhead.




AIM #3: Use suitable plots to visualize the data

1. Using only pandas (and matplotlib/seaborn if necessary) plot the distribution for
    1.1. Age
    1.2. Sleep Duration
    1.3. Quality of Sleep
    1.4. Physical Activity Level
    1.5. Stress Level
    1.6. Heart Rate
2. Using only NumPy, do the same as Step 1. You will need matplotlib for this
3. Using only pandas, use the appropriate plot to
    3.1. See the distribution of 'Sleep Duration' based on 'Quality of Sleep'
    3.2. See the distribution of 'Sleep Duration' based on 'Stress Level'
    3.3. See the distribution of 'Sleep Duration' based on 'Physical Activity Level'
    3.4. See the distribution of 'Sleep Duration' based on 'Occupation'
    3.5. See the distribution of 'Sleep Duration' based on 'BMI'
4. Using only NumPy, do the same as Step 3. You will need matplotlib for this
5. Using only pandas, use a suitable plot to see the relation between
    5.1. Age and Sleep Duration
    5.2. Sleep Duration and Heart Rate
    5.3. Heart Rate and Daily Steps
    5.4. Sleep Duration and Daily Steps
6. Using only NumPy, do the same as Step 5. You will need matplotlib for this 
7. Find the time difference between plotting using only pandas, and plotting using NumPy

In [None]:
# Write your code for AIM #3 here

import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

CSV_PATH = 'sleep_health.csv'

# Step 1 / 2: columns whose distribution is plotted as a histogram
DISTRIBUTION_COLUMNS = [
    'Age',
    'Sleep Duration',
    'Quality of Sleep',
    'Physical Activity Level',
    'Stress Level',
    'Heart Rate',
]

# Step 3 / 4: columns that 'Sleep Duration' is grouped by in the box plots
GROUPING_COLUMNS = [
    'Quality of Sleep',
    'Stress Level',
    'Physical Activity Level',
    'Occupation',
    'BMI',
]

# Step 5 / 6: (x, y) column pairs shown as scatter plots
RELATION_PAIRS = [
    ('Age', 'Sleep Duration'),
    ('Sleep Duration', 'Heart Rate'),
    ('Heart Rate', 'Daily Steps'),
    ('Sleep Duration', 'Daily Steps'),
]


def numpy_field(column):
    """Return the np.genfromtxt field name for a CSV column header.

    With names=True, genfromtxt replaces spaces in header names with
    underscores (its default replace_space='_'), so 'Sleep Duration' is
    stored as the field 'Sleep_Duration'.
    """
    return column.replace(' ', '_')


def _finish_figure():
    """Tidy the subplot spacing of the current figure and display it."""
    plt.tight_layout()
    plt.show()


def plot_with_pandas(frame):
    """Draw all three figures from a pandas DataFrame, using seaborn.

    frame: DataFrame read from the sleep-health CSV; it must contain every
    column named in DISTRIBUTION_COLUMNS, GROUPING_COLUMNS and RELATION_PAIRS.
    """
    # Steps 1.1 - 1.6: histogram (with KDE curve) of each variable
    plt.figure(figsize=(12, 10))
    for position, column in enumerate(DISTRIBUTION_COLUMNS, start=1):
        plt.subplot(3, 2, position)
        sns.histplot(frame[column], bins=30, kde=True)
        plt.title(f'{column} Distribution')
    _finish_figure()

    # Steps 3.1 - 3.5: 'Sleep Duration' box plot per level of each grouping column
    plt.figure(figsize=(12, 10))
    for position, column in enumerate(GROUPING_COLUMNS, start=1):
        plt.subplot(3, 2, position)
        sns.boxplot(x=column, y='Sleep Duration', data=frame)
        plt.title(f'Sleep Duration by {column}')
    _finish_figure()

    # Steps 5.1 - 5.4: scatter plot for each pair of variables
    plt.figure(figsize=(12, 10))
    for position, (x_column, y_column) in enumerate(RELATION_PAIRS, start=1):
        plt.subplot(3, 2, position)
        sns.scatterplot(x=x_column, y=y_column, data=frame)
        plt.title(f'{x_column} vs {y_column}')
    _finish_figure()


def plot_with_numpy(records):
    """Draw the same three figures from a NumPy structured array.

    records: structured array returned by np.genfromtxt(..., names=True);
    its field names are the CSV headers with spaces replaced by underscores.
    """
    # Steps 1.1 - 1.6: histogram of each variable
    plt.figure(figsize=(12, 10))
    for position, column in enumerate(DISTRIBUTION_COLUMNS, start=1):
        plt.subplot(3, 2, position)
        plt.hist(records[numpy_field(column)], bins=30, alpha=0.7)
        plt.title(f'{column} Distribution')
    _finish_figure()

    # Steps 3.1 - 3.5: split 'Sleep Duration' by each level of the grouping column
    sleep_duration = records[numpy_field('Sleep Duration')]
    plt.figure(figsize=(12, 10))
    for position, column in enumerate(GROUPING_COLUMNS, start=1):
        group_keys = records[numpy_field(column)]
        levels = np.unique(group_keys)
        groups = [sleep_duration[group_keys == level] for level in levels]
        plt.subplot(3, 2, position)
        plt.boxplot(groups)
        # Boxes sit at x = 1..N by default. Setting the tick labels here
        # avoids boxplot's 'labels' keyword, which Matplotlib 3.9 deprecated.
        plt.xticks(range(1, len(levels) + 1), levels)
        plt.xlabel(column)
        plt.ylabel('Sleep Duration')
        plt.title(f'Sleep Duration by {column}')
    _finish_figure()

    # Steps 5.1 - 5.4: scatter plot for each pair of variables
    plt.figure(figsize=(12, 10))
    for position, (x_column, y_column) in enumerate(RELATION_PAIRS, start=1):
        plt.subplot(3, 2, position)
        plt.scatter(records[numpy_field(x_column)], records[numpy_field(y_column)])
        plt.xlabel(x_column)
        plt.ylabel(y_column)
        plt.title(f'{x_column} vs {y_column}')
    _finish_figure()


# Load the dataset once per library, outside the timed sections, so that the
# timers below measure plotting only.
df = pd.read_csv(CSV_PATH)

# names=True already consumes the header row as field names, so skip_header
# must NOT be set as well (that would discard the real header).
data = np.genfromtxt(CSV_PATH, delimiter=',', names=True, dtype=None, encoding='utf-8')

# Time for Pandas plots
# NOTE: the timings include plt.show(). With an interactive (blocking)
# backend that also counts the time a window stays open; with the inline
# notebook backend it counts rendering only.
start_time_pandas = time.perf_counter()
plot_with_pandas(df)
end_time_pandas = time.perf_counter()
pandas_plot_time = end_time_pandas - start_time_pandas

# Time for NumPy plots
start_time_numpy = time.perf_counter()
plot_with_numpy(data)
end_time_numpy = time.perf_counter()
numpy_plot_time = end_time_numpy - start_time_numpy

# Compare times
print(f"Pandas plot time: {pandas_plot_time:.6f} seconds")
print(f"NumPy plot time: {numpy_plot_time:.6f} seconds")

if pandas_plot_time < numpy_plot_time:
    print(f"Pandas is faster by {numpy_plot_time - pandas_plot_time:.6f} seconds")
else:
    print(f"NumPy is faster by {pandas_plot_time - numpy_plot_time:.6f} seconds")




AIM #4: Other possible plotting

1. Think of other possible plots to show some interesting distribution and relations. Do this using both pandas and NumPy

