# Data Science Basics - Logistic Regression - SUV Dataset

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import math

# Load the SUV dataset from the CSV file into a DataFrame.
suv_data = pd.read_csv("data/SUV.csv")
# Show the first 10 rows as the cell output.
suv_data.head(10)

In [None]:
# Report how many rows the loaded dataset contains.
print(f"# of SUVs in original data: {len(suv_data)}")

## Analyzing Data

In [None]:
# Bar chart of how many rows fall under each "Purchased" value.
sns.countplot(x="Purchased", data=suv_data)

In [None]:
# Counts per "Purchased" value, split into one hue level per distinct "Age".
# NOTE(review): "Age" is histogrammed elsewhere in this notebook, so it looks
# numeric; one hue level per distinct age may be hard to read - consider
# binning ages first. Confirm against the data.
sns.countplot(x="Purchased", hue="Age", data=suv_data)

In [None]:
# Counts per "Purchased" value, split into one hue level per distinct
# "EstimatedSalary".
# NOTE(review): "EstimatedSalary" is histogrammed elsewhere in this notebook,
# so it looks numeric; one hue level per distinct salary may be hard to read -
# consider binning salaries first. Confirm against the data.
sns.countplot(x="Purchased", hue="EstimatedSalary", data=suv_data)

In [None]:
# Counts per "Purchased" value, split by "Gender".
sns.countplot(x="Purchased", hue="Gender", data=suv_data)

In [None]:
# Histogram of the "Age" column.
suv_data["Age"].plot.hist()

In [None]:
# Histogram of the "EstimatedSalary" column.
suv_data["EstimatedSalary"].plot.hist()

In [None]:
# Print a summary of the DataFrame (columns, non-null counts, dtypes).
suv_data.info()

In [None]:
# Remove the "User ID" column from the DataFrame in place.
suv_data.drop(columns=["User ID"], inplace=True)

In [None]:
# Print the DataFrame summary again to inspect the remaining columns.
suv_data.info()

## Data Wrangling

In [None]:
# Display a boolean mask that is True wherever a value is missing.
suv_data.isnull()

In [None]:
# Count the missing values in each column.
suv_data.isnull().sum()

In [None]:
# Heatmap of the missing-value mask, with row tick labels hidden.
sns.heatmap(suv_data.isnull(), yticklabels=False)

In [None]:
# Heatmap of the missing-value mask drawn with the "viridis" colormap,
# with row tick labels hidden.
sns.heatmap(suv_data.isnull(), yticklabels=False, cmap="viridis")

In [None]:
# Box plot with "Age" on the x axis and "EstimatedSalary" on the y axis.
# NOTE(review): both columns are histogrammed elsewhere in this notebook, so
# both look numeric; seaborn will presumably treat one axis as the grouping
# variable, giving one box per distinct value - confirm this is intended.
sns.boxplot(x="Age", y="EstimatedSalary", data=suv_data)

In [None]:
# Box plot with "EstimatedSalary" on the x axis and "Age" on the y axis.
# NOTE(review): both columns are histogrammed elsewhere in this notebook, so
# both look numeric; seaborn will presumably treat one axis as the grouping
# variable, giving one box per distinct value - confirm this is intended.
sns.boxplot(x="EstimatedSalary", y="Age", data=suv_data)

In [None]:
# Preview the indicator (dummy) columns generated from "Gender".
# The result is only displayed here; it is not stored.
pd.get_dummies(suv_data["Gender"])

In [None]:
# Encode "Gender" as indicator columns, leaving out the first category so the
# remaining columns are not redundant with each other.
sex = pd.get_dummies(data=suv_data["Gender"], drop_first=True)
# Preview the first five encoded rows.
sex.head()

In [None]:
# Append the encoded gender column(s) to the right of the existing columns.
suv_data = pd.concat([suv_data, sex], axis="columns")

In [None]:
# Preview the first five rows (head() defaults to five).
suv_data.head()

In [None]:
# Remove the original text "Gender" column from the DataFrame in place.
suv_data.drop(columns="Gender", inplace=True)

In [None]:
# Preview the first five rows (head() defaults to five).
suv_data.head()

## Train Data

In [None]:
# Select the features and the target by column name rather than by position.
# Positional indices depend on the column order left behind by the in-place
# drops and the concat that precede this cell, so they can silently pick the
# wrong columns if that order ever changes.
# Features: "Age" and "EstimatedSalary"; target: "Purchased".
x = suv_data[["Age", "EstimatedSalary"]].to_numpy()
y = suv_data["Purchased"].to_numpy()

## Independent Variables X: Age and EstimatedSalary

In [None]:
# Display the feature array.
x

## Dependent Variable Y is Purchased

In [None]:
# Display the target array.
y

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Split features and target into training and test sets.
# test_size=0.25 holds out a quarter of the rows for testing;
# random_state=0 fixes the shuffle so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=0
)

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardize the features.
sc = StandardScaler()
# Fit the scaler on the training set only, then reuse the same fitted scaler
# on the test set so no test-set statistics influence the preprocessing.
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Create the logistic regression model (random_state fixed at 0) and fit it
# on the scaled training data.
classifier = LogisticRegression(random_state=0)
classifier.fit(x_train, y_train)

In [None]:
# Predict labels for the scaled test features.
y_pred = classifier.predict(x_test)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Compare the predictions with the true test labels and display the accuracy.
accuracy_score(y_test, y_pred)