# Data Analysis

### 1. Load the Data from clean CSV

In [None]:
# Import pandas to extract the data
import pandas as pd

# Import numpy to manipulate the data
import numpy as np

# Import matplotlib for visual analysis
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.dates as mpl_dates

# Read the cleaned CSV: the first column becomes the index and is parsed
# as dates, interpreting ambiguous dates as day-first
df = pd.read_csv(
    'clean_df.csv',
    index_col=0,
    parse_dates=True,
    dayfirst=True,
)

# Preview the first 5 rows of the dataframe
df.head()

### 2. Descriptive Statistical analysis

In [None]:
# Review a quick snapshot of the dataframe to ensure everything has loaded
# as expected (df.info() prints its summary directly rather than returning it)
df.info()

In [None]:
# Summarise the weight column with its basic descriptive statistics
df.weight.describe()

In [None]:
# Count how many times each distinct weight value occurs
df.weight.value_counts()

### 3. Plot weight trend over time

In [None]:
# Plot weight trend over time

# Reset plot style
# plt.rcParams.update(plt.rcParamsDefault)

# Define plot style to use
plt.style.use('seaborn-v0_8-whitegrid')

# Plot the weight column against the dataframe index as circle markers joined
# by a solid line. plt.plot replaces plt.plot_date, which is deprecated since
# Matplotlib 3.9; the 'o' format string reproduces plot_date's default marker.
# Assumes df.index holds parsed dates (the CSV is read with parse_dates=True).
plt.plot(df.index, df.weight, 'o', linestyle='solid')
# Optional styling that can be added as keyword arguments to the call above:
#   color='darkblue', linewidth=1.5, label='Weight (lbs)'
# Figure size is set separately, e.g. plt.figure(figsize=(10, 5)) before plotting.

# Set axis range for the Y axis
plt.ylim(158.0, 168.0)

# Define date format as day, month, year
date_format = mpl_dates.DateFormatter('%d %b %Y')

# Apply date format to the X axis
plt.gca().xaxis.set_major_formatter(date_format)

# Rotate the ticks on the X axis so dates are easier to read
plt.xticks(rotation=30)

# Add plot title and axis labels for context
plt.title('Weight Trend: 1st Nov 2023-4th Apr 2024')
plt.ylabel('Weight (lbs)')
plt.xlabel('Dates')

# Add annotation for start date
# NOTE(review): df.index[1] is the second row of the dataframe; confirm whether
# df.index[0] was intended for the "Start" label or the offset is deliberate.
plt.text(df.index[1], 167.3, ' 1st Nov: Start')

# Enable tight layout so padding is consistent
plt.tight_layout()

# Display plot
plt.show()

In [None]:
# # Build a bin array with a minimum value to a maximum value with 5 dividers for 4 bins
# bins = np.linspace(min(df["weight"]), max(df["weight"]), 5)

# # View bins
# bins

In [None]:
# # Determine what bin each value belongs to
# df['weight-binned'] = pd.cut(df['weight'], bins, include_lowest=True)

# # Retrieve the first 20 values
# df[['weight','weight-binned']].head(20)

In [None]:
# See the number of weight values in each bin
# df["weight-binned"].value_counts()

In [None]:
# step = 50
# bin_range = np.arange(-200, 1000+step, step)
# out, bins  = pd.cut(step, bins=bin_range, include_lowest=True, right=False, retbins=True)
# out.value_counts().plot.bar()

In [None]:
# Plot weight frequency

# Compute the weight range once; Series.min()/max() skip missing values
weight_min = df.weight.min()
weight_max = df.weight.max()

# Build bin edges one unit wide. np.arange excludes its stop value, so the stop
# is pushed one bin width past the maximum: otherwise the last edge falls below
# the maximum and plt.hist silently drops the heaviest readings.
bin_edges = np.arange(weight_min, weight_max + 1, 1)

plt.hist(df.weight,
         bins=bin_edges,
         linewidth=2,
         edgecolor="white")

# Set axis range for the X axis so that the final bin is fully visible
plt.xlim(bin_edges[0], bin_edges[-1])

# Place 5 evenly spaced ticks between the lowest and highest weight
plt.xticks(np.linspace(weight_min, weight_max, num=5))

# Add plot title and axis labels for context
plt.title('Weight Frequency: 1st Nov 2023-4th Apr 2024')
plt.xlabel('Weight (lbs)')
plt.ylabel('Frequency')

# Display plot
plt.show()

### 4. Create Histograms for each month

In [None]:
# Get mean weight for each month
# 'MS' groups the rows by calendar month (labelled at the month start). The
# previous 'D' rule produced daily means, which did not match the monthly
# summary this cell and section describe. Requires a datetime index on df.
df.weight.resample('MS').mean().plot()

In [None]:
# Review basic statistics for object (e.g. text) columns only
# NOTE(review): the rest of this notebook only uses the numeric `weight` column;
# confirm clean_df.csv actually contains object-typed columns, otherwise this
# call has nothing to summarise.
df.describe(include=['object'])

In [None]:
# Create a scatter plot (with fitted regression line) of CPU_frequency and Price columns
# NOTE(review): `sns` is not imported in the visible cells (only pandas, numpy
# and matplotlib are) — confirm seaborn is imported elsewhere, otherwise this
# cell fails with a NameError.
# NOTE(review): CPU_frequency / Price are not referenced anywhere else in this
# notebook, which otherwise works with the `weight` column — presumably left
# over from another analysis; confirm these columns exist in clean_df.csv.
sns.regplot(x="CPU_frequency", y="Price", data=df)
# Start the Y axis at 0 and leave the upper limit unset
plt.ylim(0,)

In [None]:
# Find the correlation between CPU_frequency and Price columns
# NOTE(review): these columns are not used anywhere else in this weight-focused
# notebook — presumably left over from another analysis; confirm they exist in
# clean_df.csv before running.
df[["CPU_frequency", "Price"]].corr()

In [None]:
# Create a box plot to look at the relationship between Category and Price
# NOTE(review): `sns` is not imported in the visible cells — confirm seaborn is
# imported elsewhere. Category / Price are also not used anywhere else in this
# notebook; presumably left over from another analysis — verify against the CSV.
sns.boxplot(x="Category", y="Price", data=df)

In [None]:
# Build a single-column dataframe that keeps only the weight values
df_weight = df.loc[:, ['weight']]

# Display the new dataframe
df_weight