In [1]:
#importing Dependencies
import pandas as pd
import re

In [2]:
# Load the data from a CSV file
df = pd.read_csv('assignment_3_data.csv')

# Display the first few rows of the DataFrame
df.head(10)

Unnamed: 0,x0,x1,x34,x35,x36,x41,x45,x46,x68,x84,x93,x94
0,0.196063,3.683995,tesla,thurday,-4.664154,$124.72,-0.01%,14.360555,Jun,-11.07277,asia,0.023357
1,23.155197,-9.226725,Toyota,thur,5.220242,"$1,273.04",-0.01%,0.328324,July,4.601376,asia,1.349389
2,-13.969618,-0.343978,bmw,wednesday,-0.061424,"($1,651.19)",0.00%,3.824882,Aug,17.351746,asia,2.149993
3,49.722481,-1.500789,Toyota,wed,1.908461,$896.05,0.01%,-6.201024,Aug,6.084073,asia,-3.052037
4,-13.494403,7.445215,Honda,wednesday,0.074133,"($1,710.27)",0.01%,-12.547338,May,10.035783,asia,-0.10066
5,-52.928678,20.318698,Toyota,thurday,3.970205,($634.94),0.01%,3.714089,July,5.796239,asia,-2.98281
6,-5.095166,10.804072,bmw,thurday,3.87995,$272.90,-0.01%,-1.694294,Jun,8.353069,asia,-0.948726
7,5.49862,-9.664807,ford,wed,1.576983,"($1,659.69)",-0.01%,-1.97168,Jun,-10.427572,asia,0.849058
8,-77.503482,15.459621,volkswagon,thurday,1.976971,"($1,061.87)",0.00%,-8.424532,Jun,3.340398,america,-1.34246
9,43.214532,-25.647894,Honda,thurday,-2.755702,$526.37,0.00%,-7.966714,July,-2.655538,asia,-0.290636


In [3]:
# Check data summary
df.describe()

Unnamed: 0,x0,x1,x36,x46,x84,x94
count,999.0,999.0,999.0,999.0,999.0,999.0
mean,6.60145,-3.743892,0.252498,-0.37418,0.128428,-0.069708
std,28.66354,16.375312,4.011481,6.005888,8.505289,1.14217
min,-85.778627,-71.023933,-12.407202,-20.142073,-28.005771,-3.639954
25%,-13.251886,-14.37863,-2.372108,-4.278625,-5.753483,-0.861509
50%,5.855221,-4.029751,0.309493,-0.176247,0.145962,-0.06728
75%,25.327003,7.407054,3.039325,3.550714,6.137188,0.676601
max,105.430692,46.404407,14.457481,18.760058,28.922526,3.397178


In [4]:
# Display DataFrame information
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 12 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   x0      999 non-null    float64
 1   x1      999 non-null    float64
 2   x34     999 non-null    object 
 3   x35     999 non-null    object 
 4   x36     999 non-null    float64
 5   x41     999 non-null    object 
 6   x45     998 non-null    object 
 7   x46     999 non-null    float64
 8   x68     999 non-null    object 
 9   x84     999 non-null    float64
 10  x93     999 non-null    object 
 11  x94     999 non-null    float64
dtypes: float64(6), object(6)
memory usage: 93.8+ KB


In [5]:
# Check for null values
df.isnull().sum()

x0     0
x1     0
x34    0
x35    0
x36    0
x41    0
x45    1
x46    0
x68    0
x84    0
x93    0
x94    0
dtype: int64

In [6]:
# Drop rows with null values
df.dropna(inplace=True)

In [7]:
# Check for duplicates
df.duplicated().sum()

0

In [8]:
# Define a function to clean the 'x41' column by removing non-numeric characters
def clean_price(price):
    # Remove non-numeric characters from the price string
    cleaned_price = re.sub(r'[^\d.]', '', price)
    # Convert to float
    return float(cleaned_price)

In [9]:
# Clean up the 'x41' column
df['x41'] = df['x41'].apply(lambda x: clean_price(x))

In [10]:
# Define a function to print unique values in categorical columns
def print_unique_values(data):
    # Ensure that the input is a DataFrame
    if not isinstance(data, pd.DataFrame):
        print("Input is not a DataFrame")
        return

    # Iterate through each column and print unique values for categorical columns
    for column in data.columns:
        if data[column].dtype == 'object':
            unique_values = data[column].unique()
            print(f"Column: {column}")
            for value in unique_values:
                print(value)
            print("\n")


In [11]:
# Print unique values in the DataFrame
print_unique_values(df)

Column: x34
tesla
Toyota
bmw
Honda
ford
volkswagon
chrystler
nissan
mercades
chevrolet


Column: x35
thurday
thur
wednesday
wed
tuesday
monday
friday
fri


Column: x45
-0.01%
0.00%
0.01%
0.02%
-0.02%
-0.03%
0.03%


Column: x68
Jun
July
Aug
May
Apr
sept.
Oct
Dev
Mar
Nov


Column: x93
asia
america
euorpe




In [12]:
# Define the day mapping dictionary for abbreviated names
day_mapping = {
    'wed': 'wednesday',
    'thur': 'thursday',
    'fri': 'friday'
}

# Create a new list with expanded day names
expanded_days = [day_mapping.get(day, day) for day in df['x35']]

# Update the 'x35' column with expanded day names
df['x35'] = expanded_days

# Clean up the 'x35' column by standardizing the spelling to 'Thursday'
df['x35'] = df['x35'].str.lower().replace({'thurday': 'thursday'})

# Remove '%' symbol from the 'x45' column
df['x45'] = df['x45'].str.replace('%', '')

In [13]:
# Define correction dictionaries for specific columns
corrections = {
    'x93': {'euorpe': 'europe'},
    'x68': {'dev': 'dec'},
    'x34': {'mercades': 'mercedes'}
}

# Apply corrections to the specified columns
for col, correction_dict in corrections.items():
    df[col] = df[col].replace(correction_dict)

In [19]:
# Save the cleaned data to a new CSV file
df.to_csv('cleaned.csv', index=False)

In [20]:
df.head(30)


Unnamed: 0,x0,x1,x34,x35,x36,x41,x45,x46,x68,x84,x93,x94
0,0.196063,3.683995,tesla,thursday,-4.664154,124.72,-0.01,14.360555,Jun,-11.07277,asia,0.023357
1,23.155197,-9.226725,Toyota,thursday,5.220242,1273.04,-0.01,0.328324,July,4.601376,asia,1.349389
2,-13.969618,-0.343978,bmw,wednesday,-0.061424,1651.19,0.0,3.824882,Aug,17.351746,asia,2.149993
3,49.722481,-1.500789,Toyota,wednesday,1.908461,896.05,0.01,-6.201024,Aug,6.084073,asia,-3.052037
4,-13.494403,7.445215,Honda,wednesday,0.074133,1710.27,0.01,-12.547338,May,10.035783,asia,-0.10066
5,-52.928678,20.318698,Toyota,thursday,3.970205,634.94,0.01,3.714089,July,5.796239,asia,-2.98281
6,-5.095166,10.804072,bmw,thursday,3.87995,272.9,-0.01,-1.694294,Jun,8.353069,asia,-0.948726
7,5.49862,-9.664807,ford,wednesday,1.576983,1659.69,-0.01,-1.97168,Jun,-10.427572,asia,0.849058
8,-77.503482,15.459621,volkswagon,thursday,1.976971,1061.87,0.0,-8.424532,Jun,3.340398,america,-1.34246
9,43.214532,-25.647894,Honda,thursday,-2.755702,526.37,0.0,-7.966714,July,-2.655538,asia,-0.290636
