In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Box Plot

* Box: The box represents the interquartile range (IQR), which is the range between the first quartile (Q1 or 25th percentile) and the third quartile (Q3 or 75th percentile). The box's length depicts the spread of the middle 50% of the data.


* Median Line: A line inside the box represents the median (Q2 or 50th percentile), which divides the data into two equal halves.


* Whiskers: The lines extending from the box are called whiskers. They reach out to the smallest and largest data points that still lie within a certain range of the box, typically 1.5 times the IQR below Q1 and above Q3. Any data points beyond the whiskers are considered outliers.


* Outliers: Outliers are data points that lie an abnormal distance from the rest of the data. They are usually plotted as individual points beyond the whiskers.

In [None]:
# Display the current value of `data`.
# NOTE(review): no assignment to `data` is visible before this cell; it is first
# created in the following cell, so on a fresh kernel this line presumably raises
# NameError — confirm, and run (or move) this cell after the data-generation cell.
data

In [None]:
# Draw a reproducible sample: 1000 values from a normal distribution with
# mean 0 and standard deviation 1. A locally seeded generator means that
# Restart & Run All always produces the same figure.
rng = np.random.default_rng(42)
data = rng.normal(0, 1, 1000)

# Create a box plot on an explicit Axes so the title and label are attached
# to this figure rather than to whatever pyplot considers "current".
fig, ax = plt.subplots(figsize=(8, 6))
ax.boxplot(data)
ax.set_title("Box Plot Example")
ax.set_ylabel("Value")
plt.show()

# Interpreting Box Plots
The box plot provides a visual summary of the data's distribution, including:
* Center: The median line represents the center of the data.
* Spread: The box's length indicates the spread of the middle 50% of the data.
* Skewness: If the median line is not centered within the box, the data may be skewed.
* Outliers: Outliers are easily identifiable as individual points beyond the whiskers.

In [None]:
# Generate data for multiple groups: three reproducible samples of 100 values
# each, all with standard deviation 1 and with means 0, 2 and -1 so that the
# boxes are visibly offset from one another.
rng = np.random.default_rng(42)
group1 = rng.normal(0, 1, 100)
group2 = rng.normal(2, 1, 100)
group3 = rng.normal(-1, 1, 100)

# Create a box plot with multiple groups (one box per sample, left to right).
fig, ax = plt.subplots(figsize=(10, 6))
# patch_artist=True draws the boxes as filled patches instead of line outlines.
ax.boxplot([group1, group2, group3], patch_artist=True)
# Name the ticks explicitly instead of passing `labels=` to boxplot: that
# keyword was deprecated in Matplotlib 3.9 (renamed `tick_labels=`), whereas
# set_xticklabels behaves the same on old and new releases.
ax.set_xticklabels(["Group 1", "Group 2", "Group 3"])
ax.set_title("Box Plot with Multiple Groups")
ax.set_ylabel("Value")
plt.show()

In [None]:
# Descriptive Statistics

# Sample data: twenty integer observations held under a single 'values' key.
data = {
    'values': [
        12, 15, 14, 10, 8, 10, 9, 7, 13, 12,
        16, 14, 15, 14, 12, 10, 9, 10, 15, 13,
    ]
}

# Create a DataFrame and keep a handle on its only column.
df = pd.DataFrame(data)
observations = df['values']

# Measures of central tendency.
mean_value = observations.mean()
median_value = observations.median()
# .mode() returns every most-frequent value; only the first one is kept.
mode_value = observations.mode()[0]

# Measures of variability.
range_value = observations.max() - observations.min()
variance_value = observations.var()
std_deviation_value = observations.std()

# Print the results, one statistic per line.
print(
    f"Mean: {mean_value}",
    f"Median: {median_value}",
    f"Mode: {mode_value}",
    f"Range: {range_value}",
    f"Variance: {variance_value}",
    f"Standard Deviation: {std_deviation_value}",
    sep="\n",
)



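We can calculate the standard deviation manually using the formula below. Note that pandas' `.std()` and `.var()` compute the sample versions (dividing by $n - 1$) by default: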
![image.png](attachment:image.png)

Measuring variability: The primary use of standard deviation is to quantify the amount of variation or dispersion present in a dataset. A low standard deviation indicates that the data points tend to be clustered closely around the mean, while a high standard deviation suggests that the data is more spread out over a wider range of values.

# How to use Merge in Pandas

In this lab, we'll learn how to merge two data frames together using different types of joins. We'll use a sample dataset of employees and their departments.

https://pandas.pydata.org/docs/reference/api/pandas.merge.html

![image.png](attachment:image.png)

In [None]:
# Create sample data frames
# Employees: one row per person, with EmpID values 1 through 5.
employees = pd.DataFrame(
    [
        (1, 'John', 32),
        (2, 'Jane', 28),
        (3, 'Bob', 45),
        (4, 'Alice', 37),
        (5, 'Mike', 51),
    ],
    columns=['EmpID', 'Name', 'Age'],
)

# Departments: each row carries an EmpID. EmpID 5 appears only in `employees`
# and EmpID 6 only in `departments`, so the two frames do not match on every key.
departments = pd.DataFrame(
    [
        (1, 'Sales', 1),
        (2, 'IT', 3),
        (3, 'HR', 2),
        (4, 'Finance', 4),
        (10, 'Boss', 6),
    ],
    columns=['DeptID', 'DeptName', 'EmpID'],
)

# Show both frames as plain text, separated by a blank line.
print('Employees:', employees, '\nDepartments:', departments, sep='\n')

# Inner Join
An inner join returns a new data frame containing only the rows where the merge key values exist in both data frames.

In [None]:
# Inner join on the shared 'EmpID' column: only keys found in both frames remain.
merged_inner = employees.merge(departments, on='EmpID', how='inner')
print('\nInner Join:', merged_inner, sep='\n')

# Left Join
A left join returns a new data frame containing all rows from the left data frame and only the matching rows from the right data frame. Rows from the left data frame that have no match in the right data frame get NaN in the columns that come from the right data frame.

In [None]:
# Left join on the shared 'EmpID' column: every row of `employees` is kept.
merged_left = employees.merge(departments, on='EmpID', how='left')
print('\nLeft Join:', merged_left, sep='\n')

# Right Join
A right join is the opposite of a left join. It returns a new data frame containing all rows from the right data frame and only the matching rows from the left data frame. Rows from the right data frame that have no match in the left data frame get NaN in the columns that come from the left data frame.

In [None]:
# Right join on the shared 'EmpID' column: every row of `departments` is kept.
merged_right = employees.merge(departments, on='EmpID', how='right')
print('\nRight Join:', merged_right, sep='\n')

# Outer Join
An outer join returns a new data frame containing all rows from both data frames, combining rows with matching merge key values and filling in NaN for non-matching rows.

In [None]:
# Outer join on the shared 'EmpID' column: rows from both frames are kept.
merged_outer = employees.merge(departments, on='EmpID', how='outer')
print('\nOuter Join:', merged_outer, sep='\n')

# Lab

In [None]:
# Input locations. The two local paths are relative to the working directory;
# note the differing capitalisation of the file names, kept exactly as given.
REVIEWS_CSV = '../data/reviews.csv'
SUBMISSIONS_CSV = '../data/Submissions.csv'
# Remote CSV fetched over HTTPS, so this cell needs network access.
COMBINED_CSV_URL = 'https://raw.githubusercontent.com/raymondEDS/VRM-E/main/data/raw/df_submission_rating.csv'

df_reviews = pd.read_csv(REVIEWS_CSV)
df_submissions = pd.read_csv(SUBMISSIONS_CSV)

# NOTE(review): df_combined is not referenced by the other cells visible in
# this notebook — presumably a pre-joined fallback table; confirm its purpose.
df_combined = pd.read_csv(COMBINED_CSV_URL)

In [None]:
# Peek at the first five review rows to check the columns loaded as expected.
df_reviews.head(n=5)

In [None]:
# Average 'rating_int' over all review rows that share the same 'forum' value.
# Despite the df_ prefix, the result is a Series indexed by 'forum'.
ratings_by_forum = df_reviews.groupby('forum')['rating_int']
df_average_review_score = ratings_by_forum.mean()
df_average_review_score

In [None]:
# Build the two-column table ('forum', mean 'rating_int') directly from
# df_reviews. Reassigning df_average_review_score from its own reset_index()
# was not idempotent: running the cell a second time reset the index of an
# already-flat DataFrame and added a spurious 'index' column. Deriving the
# table from the source frame gives the same result on every run.
df_average_review_score = (
    df_reviews
    .groupby('forum', as_index=False)['rating_int']
    .mean()
)
df_average_review_score

In [None]:
# Attach each submission's mean review score by matching the submission 'id'
# to the review 'forum' key. The inner join drops rows without a match.
df_submission_score = df_submissions.merge(
    df_average_review_score,
    how='inner',
    left_on='id',
    right_on='forum',
)
df_submission_score

In [None]:
# Average the per-submission 'rating_int' within each 'conf_year', then turn
# the grouped Series back into a flat two-column DataFrame.
yearly_mean_rating = df_submission_score.groupby('conf_year')['rating_int'].mean()
df_new = yearly_mean_rating.reset_index()

In [None]:
# Display the per-'conf_year' mean of 'rating_int' built in the previous cell.
df_new

In [None]:
# Scatter plot with one point per 'conf_year': x is the year, y is the mean of
# 'rating_int' for that year. A title and a descriptive y-label are added so the
# figure can be read on its own, and plt.show() renders it without leaving the
# Axes repr as the cell's output.
ax = df_new.plot.scatter(x='conf_year', y='rating_int')
ax.set_title('Mean review rating by conference year')
ax.set_ylabel('Mean rating_int')
plt.show()

# Brainstorm:

What are some questions we can ask of the data? Here are some suggestions to look through:
* Number of submissions by year
* Length of review by year
* Reviewer confidence by year
* Topics by year