<a href="https://colab.research.google.com/github/pradhansankalp10/Data-Cleaning/blob/main/Data_Clean.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import zipfile
import os

def load_and_unzip_dataset(zip_file_path, destination_folder):
  """Loads and unzips a dataset into a new folder.

  Args:
    zip_file_path: Path to the zip file containing the dataset.
    destination_folder: Name of the new folder to store the dataset.
  """
  if not os.path.exists(destination_folder):
    os.makedirs(destination_folder)

  with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(destination_folder)

zip_file_path = '/content/Animal Dataset.zip'
destination_folder = 'Sankalp'

load_and_unzip_dataset(zip_file_path, destination_folder)

print(f"Dataset unzipped to: {destination_folder}")

Dataset unzipped to: Sankalp


In [2]:
import pandas as pd
data=pd.read_csv('/content/Sankalp/Animal Dataset.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Animal                   205 non-null    object
 1   Height (cm)              205 non-null    object
 2   Weight (kg)              205 non-null    object
 3   Color                    205 non-null    object
 4   Lifespan (years)         205 non-null    object
 5   Diet                     205 non-null    object
 6   Habitat                  205 non-null    object
 7   Predators                205 non-null    object
 8   Average Speed (km/h)     205 non-null    object
 9   Countries Found          205 non-null    object
 10  Conservation Status      205 non-null    object
 11  Family                   205 non-null    object
 12  Gestation Period (days)  205 non-null    object
 13  Top Speed (km/h)         205 non-null    object
 14  Social Structure         205 non-null    o

In [3]:
import pandas as pd
df = pd.read_csv('/content/Sankalp/Animal Dataset.csv')
df.shape

(205, 16)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Animal                   205 non-null    object
 1   Height (cm)              205 non-null    object
 2   Weight (kg)              205 non-null    object
 3   Color                    205 non-null    object
 4   Lifespan (years)         205 non-null    object
 5   Diet                     205 non-null    object
 6   Habitat                  205 non-null    object
 7   Predators                205 non-null    object
 8   Average Speed (km/h)     205 non-null    object
 9   Countries Found          205 non-null    object
 10  Conservation Status      205 non-null    object
 11  Family                   205 non-null    object
 12  Gestation Period (days)  205 non-null    object
 13  Top Speed (km/h)         205 non-null    object
 14  Social Structure         205 non-null    o

In [5]:
import pandas as pd
import os

dataset_path = '/content/Sankalp/Animal Dataset.csv'
animal_data = pd.read_csv(dataset_path)

for column in animal_data.columns:
    if animal_data[column].dtype == 'object':
        animal_data[column].fillna(animal_data[column].mode()[0], inplace=True)
    else:
        animal_data[column].fillna(animal_data[column].median(), inplace=True)

animal_data.drop_duplicates(inplace=True)

text_columns = animal_data.select_dtypes(include='object').columns
animal_data[text_columns] = animal_data[text_columns].apply(lambda x: x.str.lower().str.strip())

clean_dir = '/mnt/data/Clean'
os.makedirs(clean_dir, exist_ok=True)
cleaned_data_path = os.path.join(clean_dir, 'Cleaned_Animal_Dataset.csv')

animal_data.to_csv(cleaned_data_path, index=False)

print(f"Cleaned dataset saved at: {cleaned_data_path}")


Cleaned dataset saved at: /mnt/data/Clean/Cleaned_Animal_Dataset.csv


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  animal_data[column].fillna(animal_data[column].mode()[0], inplace=True)


In [6]:
import os
clean_dir = 'CLEANED'
os.makedirs(clean_dir, exist_ok=True)
cleaned_data_path = os.path.join(clean_dir, 'cleaned_dataset.csv')
animal_data.to_csv(cleaned_data_path, index=False)
print(f"Cleaned dataset saved at: {cleaned_data_path}")

Cleaned dataset saved at: CLEANED/cleaned_dataset.csv


In [9]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import os

cleaned_data_path = '/content/CLEANED/cleaned_dataset.csv'
animal_data = pd.read_csv(cleaned_data_path)
print("Original Data Types:\n", animal_data.dtypes)

numeric_columns = animal_data.select_dtypes(include=['number']).columns

for col in animal_data.columns:
    if animal_data[col].dtype == 'object':
        if any(char.isdigit() for char in animal_data[col].iloc[0] if char not in ['.', '-']):
            try:
                animal_data[col] = pd.to_numeric(animal_data[col].str.replace(r'[^0-9.-]', '', regex=True), errors='coerce')
            except ValueError:
                print(f"Column '{col}' could not be fully converted to numeric. Check for more complex non-numeric patterns.")
        else:
            print(f"Column '{col}' is not numeric and won't be scaled.")

numeric_columns = animal_data.select_dtypes(include=['number']).columns
categorical_columns = animal_data.select_dtypes(include=['object']).columns
print("\nData Types After Conversion:\n", animal_data.dtypes)
if len(numeric_columns) > 0:
    scaler = MinMaxScaler()
    animal_data[numeric_columns] = scaler.fit_transform(animal_data[numeric_columns])
else:
    print("No numeric columns found for scaling.")

animal_data = pd.get_dummies(animal_data, columns=categorical_columns, drop_first=True)

processed_data_path = '/mnt/data/Clean/Processed_Animal_Dataset.csv'

animal_data.to_csv(processed_data_path, index=False)

print(f"Processed dataset saved at: {processed_data_path}")

Original Data Types:
 Animal                     object
Height (cm)                object
Weight (kg)                object
Color                      object
Lifespan (years)           object
Diet                       object
Habitat                    object
Predators                  object
Average Speed (km/h)       object
Countries Found            object
Conservation Status        object
Family                     object
Gestation Period (days)    object
Top Speed (km/h)           object
Social Structure           object
Offspring per Birth        object
dtype: object
Column 'Animal' is not numeric and won't be scaled.
Column 'Color' is not numeric and won't be scaled.
Column 'Diet' is not numeric and won't be scaled.
Column 'Habitat' is not numeric and won't be scaled.
Column 'Predators' is not numeric and won't be scaled.
Column 'Countries Found' is not numeric and won't be scaled.
Column 'Conservation Status' is not numeric and won't be scaled.
Column 'Family' is not numeric an

In [10]:
import os
processed_dir = 'Preprocessed'
os.makedirs(processed_dir, exist_ok=True)
processed_data_path = os.path.join(processed_dir, 'Processed_Animal_Dataset.csv')
animal_data.to_csv(processed_data_path, index=False)
print(f"Processed dataset saved at: {processed_data_path}")

Processed dataset saved at: Preprocessed/Processed_Animal_Dataset.csv
