# 1. Welcome to the Ultimate Pandas Introduction

In [None]:
# Importing pandas and numpy libraries
import pandas as pd
import numpy as np


## Series

In [None]:
# Vehicle names that will become the values of the Series
mydata = ['Boat', 'Car', 'Bike', 'Truck']

# Wrap the list in a Series; pandas supplies a default integer index
myseries1 = pd.Series(data=mydata)

# Show the Series (index, values and dtype)
print(myseries1)


In [None]:
# Integer counts that will become the values of the Series
mydata = [1, 55, 99, 43]

# Wrap the list in a Series; pandas supplies a default integer index
myseries2 = pd.Series(data=mydata)

# Show the Series (index, values and dtype)
print(myseries2)


## DataFrame

In [None]:
# One (name, count) tuple per row
mydata = [
    ('Boat', 1),
    ('Car', 55),
    ('Bike', 99),
    ('Truck', 43),
]

# Build a two-column DataFrame, naming the tuple positions
mydf = pd.DataFrame(data=mydata, columns=['Thing', 'Count'])

# Show the DataFrame
print(mydf)


In [None]:
# Select the 'Thing' column by name using bracket notation;
# a single column comes back as a Series
mydf['Thing']

In [None]:
# Confirm the type of a single column: selecting one column of a
# DataFrame yields a pandas Series
type(mydf['Thing'])

# Reading in Data

In [None]:
# Read the CSV file into a DataFrame
# NOTE: the path points at a Kaggle input directory; adjust it when running elsewhere
df = pd.read_csv('/kaggle/input/mr-beast-youtube-video-statistics/MrBeast_youtube_stats.csv')


# Inspect The Data

In [None]:
# Preview the first five rows (head's default row count, written out)
df.head(n=5)


In [None]:
# Preview the last five rows (tail's default row count, written out)
df.tail(n=5)


In [None]:
# Show the data type of each column. reset_index() acts on the dtypes Series
# (not on df itself), turning it into a table of column name and dtype
df.dtypes.reset_index()

In [None]:
# Display descriptive statistics for the DataFrame
# (by default describe() summarises the numeric columns)
df.describe()

# Columns and Rows

In [None]:
# Select the 'viewCount' column of df with bracket notation; the result is a Series
df['viewCount']

In [None]:
# Preview the first five values of the 'viewCount' column
df['viewCount'].head(n=5)

In [None]:
# Label-based lookup: select the row whose index label is 5.
# df still carries the default integer index from read_csv at this point.
df.loc[5]


In [None]:
# Use the 'id' column as the row index of df.
# Reassign the result instead of mutating with inplace=True, matching how the
# other cells in this notebook rebind df after each transformation.
df = df.set_index('id')

In [None]:
# With 'id' as the index, .loc looks a row up by its 'id' label
df.loc['cExLQ1o2pDw']


# Subsetting Data

## Subsetting Columns

In [None]:
# Keep only the columns used in the rest of the notebook
df = df[[
    'title',
    'description',
    'publishTime',
    'duration_seconds',
    'viewCount',
    'likeCount',
    'commentCount',
]]

In [None]:
# Preview the first five rows of the column-subsetted DataFrame
df.head(n=5)

In [None]:
# Display the shape of the DataFrame as a (rows, columns) tuple
df.shape

## Subsetting Rows


In [None]:
# Keep the rows with more than one million views, using a boolean mask with .loc
df_subset1 = df.loc[df['viewCount'].gt(1_000_000)]

## Subsetting using Query

In [None]:
# Same filter as a query expression: rows with more than one million views
df_subset2 = df.query(expr='viewCount > 1000000')

In [None]:
# How many 'viewCount' entries are missing (NaN)?
pd.isna(df['viewCount']).sum()

In [None]:
# Keep only the rows whose 'viewCount' is present (i.e. drop the NaN rows)
df = df.loc[df['viewCount'].notna()]

In [None]:
# Preview the data after removing the rows with a missing 'viewCount'
df.head()

# Casting dtypes

In [None]:
# Check the column dtypes before casting
df.dtypes

In [None]:
# Fill missing counts with 0, then cast the columns to integers.
# 'int64' is spelled out because plain 'int' maps to a 32-bit integer on some
# platforms (e.g. Windows with NumPy < 2), which large counts could overflow.
df['viewCount'] = df['viewCount'].fillna(0).astype('int64')
df['likeCount'] = df['likeCount'].fillna(0).astype('int64')

In [None]:
# Check the dtypes again: 'viewCount' and 'likeCount' were just cast to integers
df.dtypes

In [None]:
# Parse the 'publishTime' values into pandas datetimes
df['publishTime'] = pd.to_datetime(arg=df['publishTime'])

In [None]:
# Demonstrate pd.to_numeric: render 'likeCount' as strings, then parse it back to numbers
df['likeCount'] = pd.to_numeric(arg=df['likeCount'].astype(str))

In [None]:
# Check the dtypes once more after the datetime and numeric conversions
df.dtypes

# Creating New Columns

In [None]:
# Calculate the like-to-view ratio.
# Zero view counts are swapped for NaN first, so those rows get a missing
# ratio (NaN) rather than inf from a division by zero.
df['like_to_view_ratio'] = df['likeCount'] / df['viewCount'].replace(0, np.nan)

In [None]:
# Show the first row to check the new 'like_to_view_ratio' column
df.head(1)

# Adding New Rows

In [None]:
# Take the last row of df as a one-row DataFrame to use as the rows being added
df_to_append = df.tail(n=1)

In [None]:
# Stack the extra row underneath df (row-wise concatenation)
df_concat_example = pd.concat(objs=[df, df_to_append], axis=0)

In [None]:
# Show the last three rows: the final row of df now appears twice
df_concat_example.tail(3)

# Plot Examples

In [None]:
# Histogram of view counts across videos, in 50 bins
import matplotlib.pyplot as plt

df['viewCount'].plot.hist(
    bins=50,
    figsize=(15, 5),
    title='Distribution of View Count',
)
plt.ylabel('Frequency')
plt.xlabel('View Count')
plt.show()

In [None]:
# Views against likes, one point per video
df.plot.scatter(
    x='viewCount',
    y='likeCount',
    figsize=(10, 6),
    title='View Count vs Like Count',
)
plt.ylabel('Like Count')
plt.xlabel('View Count')
plt.show()

In [None]:
# Keep only the videos with more than ten million likes
df_high_likes = df[df['likeCount'].gt(10_000_000)]


# Save Output

In [None]:
# Save the processed DataFrame to a CSV file.
# 'id' was moved into the index earlier in the notebook, so the index has to
# be written out; with index=False every row's id would be lost from the file.
df.to_csv('processed_data.csv', index=True)

# Done!!!