In [None]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

import warnings
warnings.filterwarnings('ignore')

## Reading the Data

In [None]:
# Load the car-price dataset; the path is relative to the notebook's working directory.
# NOTE(review): the "../input/..." layout looks like a Kaggle kernel — adjust the path when running elsewhere.
df = pd.read_csv("../input/car-price-prediction/CarPrice_Assignment.csv")
# Preview the first three rows as the cell output.
df.head(3)

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Summary statistics of the frame (pandas' default describe() output).
df.describe()

 - Take only the company name from the **"CarName"** column

In [None]:
# Derive the manufacturer from "CarName": the text before the first space.
# The guard keeps the cell re-runnable — DataFrame.insert raises ValueError
# when a column with the same name already exists.
if "Company" not in df.columns:
    Company = df["CarName"].str.split(" ").str[0]
    # Position 2 makes "Company" the third column of the frame.
    df.insert(2, "Company", Company)
df.head(3)

- Remove the **"CarName", "car_ID", "symboling"** columns

In [None]:
# Drop the three listed columns from the working frame.
# Rebinding `df` (instead of inplace=True) together with errors="ignore" keeps the cell
# re-runnable: a second execution no longer raises KeyError for columns already removed.
df = df.drop(columns=["CarName", "car_ID", "symboling"], errors="ignore")
df.head(3)

In [None]:
# Preview the first three rows again (repeats the preview that ends the previous cell).
df.head(3)

 - Rename misspelled company names in the **"Company"** column

In [None]:
# Frequency of every company name after lower-casing, so case variants are counted together.
company_counts = df.Company.str.lower().value_counts()
print(f"Unique companies names {len(company_counts)}")
company_counts

In [None]:
# Misspelled or abbreviated company names mapped to the canonical spelling.
COMPANY_NAME_FIXES = {
    "alfa-romero": "alfa romeo",
    "maxda": "mazda",
    "vw": "volkswagen",
    "vokswagen": "volkswagen",
    "porcshce": "porsche",
    "toyouta": "toyota",
}

# Lower-case first so the keys above match regardless of the original casing, then assign
# the corrected Series back to the column. The previous `df.Company.replace(..., inplace=True)`
# was a chained in-place call on a column accessor: with pandas Copy-on-Write it modifies a
# temporary object and leaves `df` unchanged.
df["Company"] = df["Company"].str.lower().replace(COMPANY_NAME_FIXES)

In [None]:
# Count the distinct (lower-cased) company names and list how often each one occurs.
names_frequency = df.Company.str.lower().value_counts()
print(f"Unique companies names {len(names_frequency)}")
names_frequency

### Data visualization

In [None]:
# Two side-by-side views of the target variable `price`: distribution (left) and spread (right).
fig, (ax_dist, ax_box) = plt.subplots(1, 2, figsize=(15, 7))

# sns.distplot is deprecated; histplot with a KDE overlay on a density scale is the
# replacement suggested by seaborn (cut=3 lets the KDE curve extend past the data range,
# as distplot did).
sns.histplot(df.price, kde=True, stat="density", kde_kws={"cut": 3}, color="g", ax=ax_dist)
ax_dist.set_title('Distribution of car prices')

sns.boxplot(y=df.price, color="g", ax=ax_box)
ax_box.set_title('Spread of car prices')

plt.show()

In [None]:
# Number of rows per company name.
df.Company.value_counts()

In [None]:
# Palette of 40 random RGB tuples (each channel in [0, 1)) used by later plotting cells.
# A dedicated, seeded generator makes the palette identical on every run; the previous
# version drew from the unseeded global `random` state, so figures changed between runs.
COLOR_SEED = 0
_color_rng = random.Random(COLOR_SEED)
colors = [
    (_color_rng.random(), _color_rng.random(), _color_rng.random())
    for _ in range(40)
]

In [None]:
# Bar chart: number of cars per company.
plt.figure(figsize=(22, 10))
# Global matplotlib setting — it stays in effect for figures drawn after this cell as well.
plt.rcParams.update({'font.size': 15})
company_frequencies = df.Company.value_counts()
plt1 = company_frequencies.plot(kind='bar', color=colors)
plt1.set(title='Companies', xlabel="Company", ylabel="Frequency")
plt.xticks(rotation=90)

plt.show()

In [None]:
# Side-by-side frequency bar charts for two categorical columns.
panels = [("fueltype", 'Fuel Type'), ("carbody", 'Car Type')]
for position, (column, panel_title) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt1 = df[column].value_counts().plot(kind='bar', color=colors)
    plt.title(panel_title)
    plt1.set(xlabel='Type', ylabel='Frequency')

plt.tight_layout()
plt.show()

In [None]:
# Price spread for each value of `doornumber`.
door_counts = df["doornumber"]
car_prices = df["price"]
sns.boxplot(x=door_counts, y=car_prices, palette="flare")

plt.show()

In [None]:
# Price spread for each value of `enginetype`.
engine_types = df["enginetype"]
car_prices = df["price"]
sns.boxplot(x=engine_types, y=car_prices, palette="viridis")

plt.show()

In [None]:
# Mean price per company, highest first, drawn in one colour picked at random from `colors`.
mean_price_by_company = (
    df.groupby(['Company'])['price']
    .mean()
    .sort_values(ascending=False)
    .to_frame()
)
mean_price_by_company.plot.bar(color=random.choice(colors))
plt.title('Comparing: Company to average price')

plt.show()

In [None]:
# Mean price per car body type, highest first, drawn in one colour picked at random from `colors`.
mean_price_by_carbody = (
    df.groupby(['carbody'])['price']
    .mean()
    .sort_values(ascending=False)
    .to_frame()
)
mean_price_by_carbody.plot.bar(color=random.choice(colors))
plt.title('Comparing: Carbody to average price')

plt.show()

In [None]:
# Mean price per fuel type, highest first, drawn in one colour picked at random from `colors`.
mean_price_by_fueltype = (
    df.groupby(['fueltype'])['price']
    .mean()
    .sort_values(ascending=False)
    .to_frame()
)
mean_price_by_fueltype.plot.bar(color=random.choice(colors))
plt.title('Comparing: Fuel type to average price')

plt.show()

In [None]:
COLOR = "RdYlBu"

def plot_count(feature, fig):
    """Draw a count plot and a price box plot for one categorical column of `df`.

    The two panels are placed next to each other in a 4x2 subplot grid of the
    current matplotlib figure.

    Parameters
    ----------
    feature : str
        Name of the column in the module-level `df` to plot.
    fig : int
        1-based subplot index of the count plot; the box plot goes to `fig + 1`.
    """
    plt.subplot(4, 2, fig)
    plt.title(feature + ' Histogram')
    # The column must be passed as `x=`: since seaborn 0.12 the only positional argument
    # is `data`, so a positionally passed Series is no longer treated as the x variable.
    sns.countplot(x=df[feature], palette=COLOR)
    plt.subplot(4, 2, fig + 1)
    plt.title(f'Comparing: {feature} to price')
    sns.boxplot(x=df[feature], y=df.price, palette=COLOR)

plt.figure(figsize=(20,25))

# Odd indices start a new row of the 4x2 grid (count plot left, box plot right).
plot_count('enginelocation', 1)
plot_count('cylindernumber', 3)
plot_count('fuelsystem', 5)
plot_count('drivewheel', 7)

plt.tight_layout()
# Render explicitly, as the other plotting cells in this notebook do.
plt.show()


## Correlation

In [None]:
# Correlation of every numeric column with `price`, strongest first.
# Restricting to numeric columns keeps corr() working on pandas >= 2.0, where DataFrame.corr
# no longer drops non-numeric columns silently and raises on string columns such as "Company".
# Selecting "price" by label replaces the former reliance on it being the last column.
price_corr = (
    df.select_dtypes(include="number")
    .corr()["price"]
    .sort_values(ascending=False)
)
# Entry 0 is price against itself, so plot the 10 columns ranked right after it.
sns.heatmap(price_corr.iloc[1:11].to_frame(), cmap="jet", annot=True)

In [None]:
# Labels of the 11 columns most positively correlated with `price`. `price` itself is
# expected to come first, since its correlation with itself is the maximum possible.
# select_dtypes avoids the error DataFrame.corr raises on pandas >= 2.0 when the frame
# still contains string columns.
strongest_features = (
    df.select_dtypes(include="number")
    .corr()
    .sort_values(by="price", ascending=False)
    .iloc[:11, :]
    .index
)
strongest_features

In [None]:
# Restrict the frame to the columns listed in `strongest_features`, in that order.
df_for_model = df.loc[:, strongest_features]
df_for_model

In [None]:
# Seed NumPy's global generator. The split below has its own random_state, so this seed
# presumably exists for later unseeded pandas/NumPy sampling — verify against the following cells.
np.random.seed(0)

# Target is the `price` column; features are every column after the first one
# (this relies on `price` being the first column of df_for_model).
y = df_for_model["price"]
X = df_for_model.iloc[:, 1:]

# Hold out 20% of the rows for testing, with a fixed split seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=76,
)

In [None]:
# Draw one random training row and keep its index label for later look-ups.
sample = X_train.sample(n=1).index[0]
# Double brackets return the row as a one-row DataFrame rather than a Series.
X_train.loc[[sample]]

In [None]:
# Target value stored under the same index label as `sample`, shown as a one-column frame.
y_train.loc[[sample]].to_frame()

In [None]:
# Ordinary least-squares linear model fitted on the training split.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [None]:
# One fitted coefficient per feature column, labelled with the feature name.
pd.DataFrame({"Factor": model.coef_}, index=X.columns)

# That means: a one-unit increase in enginesize raises the predicted price by about 86.68, holding the other features constant

In [None]:
# Predictions of the fitted model for the held-out test features.
y_pred = model.predict(X_test)

In [None]:
# Score of the model on the test split (for scikit-learn regressors, score() is documented as R^2).
model.score(X_test, y_test)

In [None]:

# Actual test prices (x axis) against predicted prices (y axis).
plt.scatter(y_test, y_pred)
# The title previously said "X_test and y_pred", but the x axis shows y_test, not X_test.
plt.title("Comparing: \ny_test and y_pred", fontsize=20)

plt.xlabel('y_test', fontsize=15)
plt.ylabel('y_pred', fontsize=15)

plt.grid()
plt.show()