# Logistic Regression Assignment

### importing

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Data

In [6]:
df = pd.read_csv('bikebuyer.csv')

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6997 entries, 0 to 6996
Data columns (total 13 columns):
ID                  6996 non-null float64
Marital Status      6981 non-null object
Gender              6968 non-null object
Yearly Income       6997 non-null int64
Children            6979 non-null float64
Education           6997 non-null object
Occupation          6997 non-null object
Home Owner          6997 non-null object
Cars                6997 non-null int64
Commute Distance    6968 non-null float64
Region              6997 non-null object
Age                 6997 non-null int64
Bike Buyer          6997 non-null object
dtypes: float64(3), int64(3), object(7)
memory usage: 710.8+ KB


In [8]:
df.head()

Unnamed: 0,ID,Marital Status,Gender,Yearly Income,Children,Education,Occupation,Home Owner,Cars,Commute Distance,Region,Age,Bike Buyer
0,22711.0,Single,Male,30000,0.0,Partial College,Clerical,No,1,1.0,Europe,33,Yes
1,13555.0,Married,Female,40000,0.0,Graduate Degree,Clerical,Yes,0,1.0,Europe,37,Yes
2,,Married,Male,160000,5.0,Partial College,Professional,No,3,2.0,Europe,55,No
3,2.0,Single,Male,160000,0.0,Graduate Degree,Management,Yes,2,5.0,Pacific,47,No
4,25410.0,,Female,70000,2.0,Bachelors,Skilled Manual,No,1,1.0,North America,38,Yes


### Checking for Null values

In [9]:
df.isnull().any()

ID                   True
Marital Status       True
Gender               True
Yearly Income       False
Children             True
Education           False
Occupation          False
Home Owner          False
Cars                False
Commute Distance     True
Region              False
Age                 False
Bike Buyer          False
dtype: bool

### How many null values

In [10]:
df.isnull().sum()

ID                   1
Marital Status      16
Gender              29
Yearly Income        0
Children            18
Education            0
Occupation           0
Home Owner           0
Cars                 0
Commute Distance    29
Region               0
Age                  0
Bike Buyer           0
dtype: int64

### Handling Null Values

In [11]:
# Dropping row with empty ID

df.dropna(axis = 0, inplace = True, subset = ['ID'])

In [12]:
# Martial Status

marriage = df['Marital Status'].mode()

df['Marital Status'].fillna(marriage[0], inplace = True)

In [13]:
# Gender

gender = df['Gender'].mode()

df['Gender'].fillna(gender[0], inplace = True)

In [14]:
# Children

df['Children'].fillna(df['Children'].mean(), inplace = True)

In [15]:
# Commute Distance

df['Commute Distance'].fillna(df['Commute Distance'].mean(), inplace = True)

In [16]:
df.isnull().sum()

ID                  0
Marital Status      0
Gender              0
Yearly Income       0
Children            0
Education           0
Occupation          0
Home Owner          0
Cars                0
Commute Distance    0
Region              0
Age                 0
Bike Buyer          0
dtype: int64

### Split Dependent and Independent Variable

In [17]:
# Independent Variables

x = df.iloc[:, 0:12].values

In [18]:
x

array([[22711.0, 'Single', 'Male', ..., 1.0, 'Europe', 33],
       [13555.0, 'Married', 'Female', ..., 1.0, 'Europe', 37],
       [2.0, 'Single', 'Male', ..., 5.0, 'Pacific', 47],
       ...,
       [22823.0, 'Married', 'Female', ..., 1.0, 'Europe', 53],
       [22825.0, 'Single', 'Female', ..., 1.0, 'Europe', 54],
       [22826.0, 'Married', 'Male', ..., 2.0, 'Europe', 54]], dtype=object)

In [19]:
# Dependent Variable

y = df.iloc[:, -1].values

In [20]:
y

array(['Yes', 'Yes', 'No', ..., 'No', 'No', 'No'], dtype=object)

### Handling Categorical Data

In [21]:
# importing

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# object creation

ct = ColumnTransformer([('oe', OneHotEncoder(), [1, 2, 5, 6, 7, 10])], remainder = 'passthrough')

In [22]:
# Encoding the data

x_encode = ct.fit_transform(x)

In [23]:
x_encode

array([[0.0, 1.0, 0.0, ..., 1, 1.0, 33],
       [1.0, 0.0, 1.0, ..., 0, 1.0, 37],
       [0.0, 1.0, 0.0, ..., 2, 5.0, 47],
       ...,
       [1.0, 0.0, 1.0, ..., 2, 1.0, 53],
       [0.0, 1.0, 1.0, ..., 3, 1.0, 54],
       [1.0, 0.0, 0.0, ..., 3, 2.0, 54]], dtype=object)

In [24]:
# importing

from sklearn.preprocessing import LabelEncoder

# object Creation

le = LabelEncoder()

In [25]:
# Encoding the Data

y_encode = le.fit_transform(y)

In [26]:
y_encode

array([1, 1, 0, ..., 0, 0, 0])

### Split Train and Test Data

In [27]:
# importing

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x_encode, y_encode, test_size = 0.2, random_state = 0)

### Building the Model

In [28]:
# importing

from sklearn.linear_model import LogisticRegression

# object creation

log = LogisticRegression()

In [29]:
# build the model

log.fit(x_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

### Predciting the values

In [30]:
y_pred = log.predict(x_test)

In [31]:
# predicted values

y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [32]:
# actual values

y_test

array([0, 0, 0, ..., 0, 0, 0])

### Accuracy

In [33]:
# importing

from sklearn.metrics import accuracy_score

# accuracy

accuracy = accuracy_score(y_pred, y_test)

In [34]:
accuracy

0.8628571428571429

## User Inputs

In [35]:
df.head(1)

Unnamed: 0,ID,Marital Status,Gender,Yearly Income,Children,Education,Occupation,Home Owner,Cars,Commute Distance,Region,Age,Bike Buyer
0,22711.0,Single,Male,30000,0.0,Partial College,Clerical,No,1,1.0,Europe,33,Yes


In [34]:
# Inputs are: ID, Marital Status(Married, Single), Gender(Male, Female), Yearly Income, Children, 
#             Education(Studied), Occupation(Job Doing), 
#             Home Owner(Yes, No), Cars, Commute Distance, Region(Name of the Region), Age


user_input = [12345, 'Married', 'Female', 45000, 1, 'Graduate Degree', 'Management', 'No', 0, 1.5, 'North America', 25]

In [36]:
# Encoding user_input

user_input_encode = ct.transform([user_input])

NameError: name 'user_input' is not defined

In [36]:
# Predicting the Output

if(log.predict(user_input_encode)[0]):
    print("Yes")
else:
    print("No")

No


---------------------------------------------------





----------------------------------------------------------------------------------