<a href="https://colab.research.google.com/github/saisumanth-boyapati/sai/blob/main/notebook1a2258af3e.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'car-price-predictionused-cars:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F2491159%2F4226692%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240714%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240714T163116Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D90678d56865c124f127f34800b573cb00fdc960c657e2e83001d8063a401d2c1ee531c5f7c0d6598acd09b1edf971b473b6371bae5d176d37db6ad69c00afc8132494e53472ae065c7cb8241063f1c92f2da3d1718395776fcd7d52cb70cfe7b1541c32f9c4839ff939a549eadef2e556ebfb30cf9e9d2f57b4198f16f821601c0d96b18f229f737e9e0a3bad022b6608ada49b7c6e2f54c682830b0100402c10cee940990864883590198441327c84e0abd077749e3ab9e2af042f3410048fa01e0896862faa7b4735fdc040115479831d4ec355cdde378f20798d4a9d29c6a9fdc2e6ee2e198217693e2b31d9b4861c1b9f31135ff5a1e2ca479d8aef98c7b'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<url-encoded download URL>" entry of DATA_SOURCE_MAPPING
# into a temporary file and unpack it to KAGGLE_INPUT_PATH/<directory>.
# Failures are reported per source and do not stop the remaining downloads.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # maxsplit=1: only the first ':' separates the directory from the URL part.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            # The header can be missing (None); then the progress bar stays empty
            # instead of crashing on int(None) or dividing by zero.
            total_bytes = int(total_length) if total_length else 0
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_bytes)) if total_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Make sure all bytes are on disk before the archive is reopened by name.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name: reusing `tarfile` here would
                # replace the module and break every later tar source.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
# Note: the loop variables (dirname, filenames, filename) stay bound at notebook
# scope after the loop ends, holding whatever the last iteration assigned.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Additional imports.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

Load the dataset and display some rows of the dataframe.

In [None]:
# Locate the dataset CSV explicitly under /kaggle/input rather than relying on
# loop variables left behind by another cell, so this cell works on its own.
csv_paths = sorted(
    os.path.join(root, name)
    for root, _dirs, names in os.walk('/kaggle/input')
    for name in names
    if name.lower().endswith('.csv')
)
if not csv_paths:
    raise FileNotFoundError(
        "No CSV file found under /kaggle/input - run the data import cell first."
    )

# If several CSV files exist, the first one in sorted order is used.
car_data = csv_paths[0]
df_cars = pd.read_csv(car_data)
print(f"Shape of the dataset: {df_cars.shape[0]} rows, {df_cars.shape[1]} columns")


In [None]:
# Show the first rows of the loaded dataframe (head() with its default row count).
df_cars.head()

## Some Exploratory Data Analysis  

Unique values of categorical variables.

In [None]:
# Print the distinct values of each categorical column.
# Year and Owner are listed in sorted order; the others in order of appearance.
year_values = sorted(df_cars['Year'].unique())
print(f"Year of the car values: {year_values}")

fuel_values = df_cars['Fuel_Type'].unique()
print(f"Fuel type values: {fuel_values}")

selling_values = df_cars['Selling_type'].unique()
print(f"Selling type values: {selling_values}")

transmission_values = df_cars['Transmission'].unique()
print(f"Transmission values: {transmission_values}")

owner_values = sorted(df_cars['Owner'].unique())
print(f"Number of owners: {owner_values}")

Basic statistics of the continuous variables.

In [None]:
# Summary statistics for the three continuous columns.
continuous_cols = ['Selling_Price', 'Present_Price', 'Driven_kms']
df_cars[continuous_cols].describe()

Check null values and data types.

In [None]:
# Print the dataframe summary: column dtypes and non-null counts.
df_cars.info()

In [None]:
# Count the cars per production year (non-null Car_Name entries in each Year group).
grp_year = (
    df_cars.groupby('Year')['Car_Name']
    .count()
    .reset_index(name='No of Cars Sold')
)

# Draw on an explicit Axes and pass the data by keyword, so the call does not
# depend on the implicit current axes or on positional-argument support.
fig, ax = plt.subplots(figsize=(9, 5))
sns.barplot(data=grp_year, x='Year', y='No of Cars Sold', color='tab:blue', ax=ax)
sns.despine(fig)
ax.set_xlabel('Year of the car')
ax.set_ylabel('No. of cars sold')
ax.set_title('No. of cars sold by production year')
plt.show()

In [None]:
# Scatter of present price against selling price, coloured by selling type.
sns.relplot(
    data=df_cars,
    x='Present_Price',
    y='Selling_Price',
    hue='Selling_type',
)
plt.show()

## Unsupervised Models  

Although the dataset is not high-dimensional, some unsupervised models are applied to see whether they identify distinct classes.

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.manifold import TSNE
from sklearn.manifold import MDS
from sklearn.decomposition import PCA
from sklearn.decomposition import NMF

In [None]:
# Select the continuous columns and standardise them for the models below.
continuous_features = ['Selling_Price', 'Present_Price', 'Driven_kms']
df_cont = df_cars[continuous_features]

scaler = StandardScaler()
X = scaler.fit_transform(df_cont)

In [None]:
# Two-dimensional t-SNE embedding of the scaled features (fixed seed).
tsne = TSNE(random_state=224, n_components=2)
X_trans = tsne.fit_transform(X)

In [None]:
# Put the embedding into a frame and plot it, coloured by selling type.
df_trans = pd.DataFrame(X_trans[:, :2], columns=['tsne_1', 'tsne_2'])

fig_tsne, ax_tsne = plt.subplots(figsize=(6, 6))
sns.scatterplot(
    data=df_trans,
    x='tsne_1',
    y='tsne_2',
    hue=df_cars['Selling_type'],
    ax=ax_tsne,
)
plt.show()

In [None]:
# Two-dimensional MDS embedding of the scaled features (fixed seed).
mds = MDS(random_state=224, n_components=2)
X_trans = mds.fit_transform(X)

In [None]:
# Put the embedding into a frame and plot it, coloured by selling type.
df_trans = pd.DataFrame(X_trans[:, :2], columns=['mds_1', 'mds_2'])

fig_mds, ax_mds = plt.subplots(figsize=(6, 6))
sns.scatterplot(
    data=df_trans,
    x='mds_1',
    y='mds_2',
    hue=df_cars['Selling_type'],
    ax=ax_mds,
)
plt.show()

In [None]:
# Project the scaled features onto two principal components (fixed seed).
pca = PCA(random_state=224, n_components=2)
X_trans = pca.fit_transform(X)

In [None]:
# Put the projection into a frame and plot it, coloured by selling type.
df_trans = pd.DataFrame(X_trans[:, :2], columns=['pca_1', 'pca_2'])

fig_pca, ax_pca = plt.subplots(figsize=(6, 6))
sns.scatterplot(
    data=df_trans,
    x='pca_1',
    y='pca_2',
    hue=df_cars['Selling_type'],
    ax=ax_pca,
)
plt.show()

In [None]:
# Rescale the continuous columns with MinMaxScaler before factorising
# (presumably because NMF requires non-negative input).
scaler = MinMaxScaler()
X = scaler.fit_transform(df_cont)

# Two-component NMF with NNDSVD initialisation and a fixed seed.
nmf = NMF(init='nndsvd', random_state=224, n_components=2)
X_trans = nmf.fit_transform(X)

In [None]:
# Put the factorisation into a frame and plot it, coloured by selling type.
df_trans = pd.DataFrame(X_trans[:, :2], columns=['nmf_1', 'nmf_2'])

fig_nmf, ax_nmf = plt.subplots(figsize=(6, 6))
sns.scatterplot(
    data=df_trans,
    x='nmf_1',
    y='nmf_2',
    hue=df_cars['Selling_type'],
    ax=ax_nmf,
)
plt.show()

In [None]:
from sklearn.cluster import DBSCAN

In [None]:
# Standardise the continuous columns again (X was last set by the MinMax/NMF cell).
scaler = StandardScaler()
X = scaler.fit_transform(df_cont)

# Density-based clustering on the standardised features.
db = DBSCAN(min_samples=10, eps=0.3)
db.fit(X)
labels = db.labels_

# The label -1 is counted as noise, so it is excluded from the cluster count.
distinct_labels = set(labels)
n_clusters_ = len(distinct_labels) - int(-1 in distinct_labels)
n_noise_ = int(np.count_nonzero(labels == -1))

print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)

In [None]:
# Plot the DBSCAN result using the first two columns of X:
# core samples as large markers, the remaining samples as small ones.
unique_labels = set(labels)
core_samples_mask = np.zeros(labels.shape, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True

colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))]
for k, col in zip(unique_labels, colors):
    # Black used for noise.
    face_color = (0, 0, 0, 1) if k == -1 else tuple(col)
    class_member_mask = labels == k

    # Core samples first (size 14), then the non-core samples (size 6).
    for subset_mask, marker_size in (
        (core_samples_mask, 14),
        (~core_samples_mask, 6),
    ):
        xy = X[class_member_mask & subset_mask]
        plt.plot(
            xy[:, 0],
            xy[:, 1],
            "o",
            markerfacecolor=face_color,
            markeredgecolor="k",
            markersize=marker_size,
        )

plt.title(f"Estimated number of clusters: {n_clusters_}")
plt.show()


## Simple Classification  

Use `Selling_Price`, `Present_Price` and `Driven_kms` to predict `Selling_type`, i.e. whether the selling type is `Dealer` or `Individual`. A default Logistic Regression classifier is trained on the scaled values of the dataset.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Features and target for the selling-type classifier.
classifier_features = ['Selling_Price', 'Present_Price', 'Driven_kms']
X, y = df_cars[classifier_features], df_cars['Selling_type']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=224
)

# The scaler is fitted on the training split only and then reused for the test split.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

lr = LogisticRegression(random_state=224).fit(X_scaled, y_train)
y_pred = lr.predict(X_test_scaled)

test_accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy score: {test_accuracy}")

## Price prediction  

Using a linear regression model to predict the price.

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

In [None]:
# Linear regression of the selling price on present price and driven kilometres.
price_features = ['Present_Price', 'Driven_kms']
X, y = df_cars[price_features], df_cars['Selling_Price']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=224
)
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)

# Report the test-split metrics one by one.
r2 = r2_score(y_test, y_pred)
print(f"r^2 score: {r2}")
mse = mean_squared_error(y_test, y_pred)
print(f"MSE: {mse}")
mae = mean_absolute_error(y_test, y_pred)
print(f"MAE: {mae}")
mape = mean_absolute_percentage_error(y_test, y_pred)
print(f"MAPE: {mape}")

In [None]:
# Ten rows with the highest selling price, most expensive first.
cars_by_price_desc = df_cars.sort_values('Selling_Price', ascending=False)
cars_by_price_desc.iloc[:10]

Remove the outlier(s): keep only cars with a selling price below 30.0.

In [None]:
# Keep only the rows whose selling price is below 30.0.
is_below_cutoff = df_cars['Selling_Price'].lt(30.0)
df_cars = df_cars.loc[is_below_cutoff]

In [None]:
# Same linear regression as before, fitted on the current contents of df_cars.
price_features = ['Present_Price', 'Driven_kms']
X, y = df_cars[price_features], df_cars['Selling_Price']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=224
)
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)

# Report the test-split metrics one by one.
r2 = r2_score(y_test, y_pred)
print(f"r^2 score: {r2}")
mse = mean_squared_error(y_test, y_pred)
print(f"MSE: {mse}")
mae = mean_absolute_error(y_test, y_pred)
print(f"MAE: {mae}")
mape = mean_absolute_percentage_error(y_test, y_pred)
print(f"MAPE: {mape}")

In [None]:
from sklearn.linear_model import Ridge

In [None]:
# Ridge regression (alpha=0.01) of the selling price on the same two features.
price_features = ['Present_Price', 'Driven_kms']
X, y = df_cars[price_features], df_cars['Selling_Price']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=224
)
ridge = Ridge(alpha=0.01).fit(X_train, y_train)
y_pred = ridge.predict(X_test)

# Report the test-split metrics one by one.
r2 = r2_score(y_test, y_pred)
print(f"r^2 score: {r2}")
mse = mean_squared_error(y_test, y_pred)
print(f"MSE: {mse}")
mae = mean_absolute_error(y_test, y_pred)
print(f"MAE: {mae}")
mape = mean_absolute_percentage_error(y_test, y_pred)
print(f"MAPE: {mape}")