**TASK-2**

Cleaning Data

In [1]:
import pandas as pd

# Load the dataset
file_path = r"/content/USvideos.csv"
data = pd.read_csv(file_path)

# 1. Data Integrity: Ensuring accuracy, consistency, and reliability
# Checking for any unexpected data types or potential issues in each column
print("Initial data info:")
print(data.info())
print("\nSummary statistics:")
print(data.describe(include='all'))

Initial data info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40900 entries, 0 to 40899
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   video_id                40900 non-null  object 
 1   trending_date           40900 non-null  object 
 2   title                   40900 non-null  object 
 3   channel_title           40900 non-null  object 
 4   category_id             40900 non-null  int64  
 5   publish_time            40900 non-null  object 
 6   tags                    40900 non-null  object 
 7   views                   40900 non-null  float64
 8   likes                   40900 non-null  float64
 9   dislikes                40900 non-null  int64  
 10  comment_count           40900 non-null  float64
 11  thumbnail_link          40900 non-null  object 
 12  comments_disabled       40900 non-null  bool   
 13  ratings_disabled        40900 non-null  bool   
 14  video_error_or_remo

In [2]:
# 2. Missing Data Handling: Fill or drop missing values
# For numerical columns, we use median imputation, and for categorical columns, we use the mode.

# Impute missing values
for column in data.columns:
    if data[column].dtype == 'object':  # Categorical column
        data[column] = data[column].fillna(data[column].mode()[0])
    else:  # Numerical column
        data[column] = data[column].fillna(data[column].median())

In [3]:
# 3. Duplicate Removal: Remove duplicate rows
initial_row_count = data.shape[0]
data = data.drop_duplicates()
final_row_count = data.shape[0]
print(f"\nDuplicates removed: {initial_row_count - final_row_count}")


Duplicates removed: 0


In [4]:
# 4. Standardization: Ensure consistent formatting and units
# Convert 'date' column to datetime format if it exists
if 'date' in data.columns:
    data['date'] = pd.to_datetime(data['date'], errors='coerce')

In [5]:
# Standardize all text columns to lowercase for consistency
for column in data.select_dtypes(include=['object']).columns:
    data[column] = data[column].str.lower()

In [6]:
# 5. Outlier Detection: Identify and handle outliers in numerical columns
# Using the IQR (Interquartile Range) method to filter outliers

def handle_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df[column] = df[column].clip(lower_bound, upper_bound)

# Apply the outlier handling to numerical columns
for column in data.select_dtypes(include=['float64', 'int64']).columns:
    handle_outliers(data, column)

# Display the cleaned dataset
print("\nData after cleaning:")
print(data.info())
print(data.head())

# Save the cleaned data to a new CSV file
data.to_csv(r"/content/USvideos.csv", index=False)
print("\nCleaned data saved to 'USvideos_cleaned.csv'")


Data after cleaning:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40900 entries, 0 to 40899
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   video_id                40900 non-null  object 
 1   trending_date           40900 non-null  object 
 2   title                   40900 non-null  object 
 3   channel_title           40900 non-null  object 
 4   category_id             40900 non-null  int64  
 5   publish_time            40900 non-null  object 
 6   tags                    40900 non-null  object 
 7   views                   40900 non-null  float64
 8   likes                   40900 non-null  float64
 9   dislikes                40900 non-null  int64  
 10  comment_count           40900 non-null  float64
 11  thumbnail_link          40900 non-null  object 
 12  comments_disabled       40900 non-null  bool   
 13  ratings_disabled        40900 non-null  bool   
 14  video_error_or_r