# Importing Libraries

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Importing and Viewing the Dataset

In [3]:
# Read the raw coupon-recommendation data and preview its first five rows.
DATA_PATH = 'in-vehicle-coupon-recommendation.csv'
df = pd.read_csv(DATA_PATH)
df.head(5)


Unnamed: 0,destination,passanger,weather,temperature,time,coupon,expiration,gender,age,maritalStatus,...,CoffeeHouse,CarryAway,RestaurantLessThan20,Restaurant20To50,toCoupon_GEQ5min,toCoupon_GEQ15min,toCoupon_GEQ25min,direction_same,direction_opp,Y
0,No Urgent Place,Alone,Sunny,55,2PM,Restaurant(<20),1d,Female,21,Unmarried partner,...,never,,4~8,1~3,1,0,0,0,1,1
1,No Urgent Place,Friend(s),Sunny,80,10AM,Coffee House,2h,Female,21,Unmarried partner,...,never,,4~8,1~3,1,0,0,0,1,0
2,No Urgent Place,Friend(s),Sunny,80,10AM,Carry out & Take away,2h,Female,21,Unmarried partner,...,never,,4~8,1~3,1,1,0,0,1,1
3,No Urgent Place,Friend(s),Sunny,80,2PM,Coffee House,2h,Female,21,Unmarried partner,...,never,,4~8,1~3,1,1,0,0,1,0
4,No Urgent Place,Friend(s),Sunny,80,2PM,Coffee House,1d,Female,21,Unmarried partner,...,never,,4~8,1~3,1,1,0,0,1,0


In [4]:
# Distinct-value count for every column (axis=0 is the per-column default).
# A column with a single distinct value is constant across all rows.
df.nunique(axis=0)

destination              3
passanger                4
weather                  3
temperature              3
time                     5
coupon                   5
expiration               2
gender                   2
age                      8
maritalStatus            5
has_children             2
education                6
occupation              25
income                   9
car                      5
Bar                      5
CoffeeHouse              5
CarryAway                5
RestaurantLessThan20     5
Restaurant20To50         5
toCoupon_GEQ5min         1
toCoupon_GEQ15min        2
toCoupon_GEQ25min        2
direction_same           2
direction_opp            2
Y                        2
dtype: int64

In [5]:
# Skewness of the numeric columns. numeric_only=True is explicit because
# newer pandas no longer silently skips string columns and raises instead.
skewness = df.skew(numeric_only=True)
# Percentage of missing values per column, largest first.
missing_pct = df.isnull().sum().sort_values(ascending=False) * 100 / len(df)
# Share of each target class; as the last expression this is what the cell displays.
df['Y'].value_counts() / len(df)

1    0.568433
0    0.431567
Name: Y, dtype: float64

# Dividing Independent and Dependent Variables

In [6]:
# Number of fully duplicated rows in the raw data. Computed once and kept in a
# variable; the original evaluated this twice and discarded both results.
n_duplicate_rows = df.duplicated().sum()
# Select the target by name rather than by position so the split does not
# depend on 'Y' being the last column. X is every other column.
X, Y = df.drop(columns='Y'), df['Y']
print(X.columns)
print(X.shape, Y.shape)


Index(['destination', 'passanger', 'weather', 'temperature', 'time', 'coupon',
       'expiration', 'gender', 'age', 'maritalStatus', 'has_children',
       'education', 'occupation', 'income', 'car', 'Bar', 'CoffeeHouse',
       'CarryAway', 'RestaurantLessThan20', 'Restaurant20To50',
       'toCoupon_GEQ5min', 'toCoupon_GEQ15min', 'toCoupon_GEQ25min',
       'direction_same', 'direction_opp'],
      dtype='object')
(12684, 25) (12684,)


# Classifying Independent Variables Into Buckets

In [7]:
# Columns grouped by the kind of encoding they receive.
# NOTE(review): only 'toCoupon_GEQ5min' is explicitly dropped from X later on;
# 'direction_opp' is simply never selected for the merged frame.
drop_attrs = [
    'toCoupon_GEQ5min',
    'direction_opp',
]
binary_attrs = [
    'gender',
    'has_children',
    'direction_same',
]
frequency_attrs = [
    'Bar',
    'CoffeeHouse',
    'CarryAway',
    'RestaurantLessThan20',
    'Restaurant20To50',
]
# 'toCoupon' is not a raw column; it is derived from the toCoupon_GEQ* flags.
# NOTE(review): 'temperature' is listed here but is never added to the merged
# frame -- confirm whether that omission is intentional.
ordinal_attrs = [
    'temperature',
    'time',
    'expiration',
    'age',
    'education',
    'income',
    'toCoupon',
]
nominal_attrs = [
    'car',
    'destination',
    'passanger',
    'weather',
    'coupon',
    'maritalStatus',
    'occupation',
]

# Processing Binary Attributes

In [8]:
# 'toCoupon_GEQ5min' has a single distinct value, so it is removed.
# errors='ignore' keeps this cell re-runnable: without it a second execution
# raises KeyError because the column is already gone.
X = X.drop(columns='toCoupon_GEQ5min', errors='ignore')
# Replace the gender strings with integer codes; the fitted encoder is kept so
# the codes can be mapped back to labels via gender_encoder.classes_.
gender_encoder = LabelEncoder()
X['gender'] = gender_encoder.fit_transform(X['gender'])
# Collect the binary attributes into their own frame.
X_binary = X[binary_attrs]

In [9]:
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
X_binary.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12684 entries, 0 to 12683
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   gender          12684 non-null  int32
 1   has_children    12684 non-null  int64
 2   direction_same  12684 non-null  int64
dtypes: int32(1), int64(2)
memory usage: 247.9 KB
None


# Processing Ordinal (Ordered Categorical) Attributes

In [10]:
# Category orderings, lowest to highest. The position of a label in its list
# becomes the integer code assigned to it by the ordinal encoding.
income_order = [
    'Less than $12500',
    '$12500 - $24999',
    '$25000 - $37499',
    '$37500 - $49999',
    '$50000 - $62499',
    '$62500 - $74999',
    '$75000 - $87499',
    '$87500 - $99999',
    '$100000 or More',
]
edu_order = [
    'Some High School',
    'High School Graduate',
    'Some college - no degree',
    'Associates degree',
    'Bachelors degree',
    'Graduate degree (Masters or Doctorate)',
]
age_order = [
    'below21',
    '21', '26', '31', '36', '41', '46',
    '50plus',
]
time_order = ['7AM', '10AM', '2PM', '6PM', '10PM']
# Shorter validity first: two hours, then one day.
expiration_order = ['2h', '1d']

In [11]:
def reorder_encoding(attr, order, frame=None):
    """Encode an ordinal column as integer codes that follow ``order``.

    The column is converted in place to an ordered categorical whose
    categories appear exactly in ``order``; the returned codes are the
    positions of each value in that list.

    Parameters
    ----------
    attr : str
        Name of the column to encode.
    order : list
        Category labels from lowest to highest. Must contain exactly the
        labels present in the column.
    frame : pandas.DataFrame, optional
        Frame holding the column. Defaults to the notebook-level ``X`` so
        existing two-argument calls keep working.

    Returns
    -------
    pandas.Series
        Integer codes named ``attr``; missing values are encoded as -1.

    Raises
    ------
    ValueError
        If ``order`` does not match the column's categories
        (raised by ``reorder_categories``).
    """
    if frame is None:
        # Backward-compatible default: operate on the notebook's global frame.
        frame = X

    ordered = frame[attr].astype('category').cat.reorder_categories(
        new_categories=order,
        ordered=True
    )
    # Keep the original side effect: the source column becomes ordered categorical.
    frame[attr] = ordered

    return pd.Series(ordered.cat.codes, name=attr)


# Encode each ordinal column with its ordering; the calls run in the listed
# order and each result is bound to its own X_* series.
X_income, X_edu, X_age, X_time, X_expiration = (
    reorder_encoding(column, ordering)
    for column, ordering in (
        ('income', income_order),
        ('education', edu_order),
        ('age', age_order),
        ('time', time_order),
        ('expiration', expiration_order),
    )
)

# Processing Frequency Attributes

In [12]:
# Visit-frequency labels mapped to increasing integer levels.
freq_order = {
    'never': 0, 
    'less1': 1,
    '1~3': 2, 
    '4~8': 3, 
    'gt8': 4
}

# Map from the untouched raw frame `df` instead of from X itself. Mapping
# X[col] in place is not re-runnable: on a second execution the values are
# already integers, none of them is a key of freq_order, and every entry
# becomes NaN. Reading from df gives the same result on every run.
# Missing values stay missing (na_action='ignore').
for col in frequency_attrs:
    X[col] = df[col].map(freq_order, na_action='ignore')

In [13]:
# Combine the two distance flags into one ordinal feature: the row-wise sum of
# 'toCoupon_GEQ15min' and 'toCoupon_GEQ25min'.
to_coupon = X[['toCoupon_GEQ15min', 'toCoupon_GEQ25min']].to_numpy()
# The sum is computed once (the original also stored an unused copy in `t`).
# Carrying X's index keeps the series row-aligned with the other feature
# blocks when they are concatenated.
X_toCoupon = pd.Series(
    data=to_coupon.sum(axis=1),
    index=X.index,
    name='toCoupon'
)

# Processing Nominal Attributes

In [14]:
# One-hot encode the nominal columns. The dtype is pinned because pandas >= 2.0
# defaults to boolean dummies, which would be written to the CSV as True/False;
# an integer dtype gives 0/1 indicators on every pandas version.
X_nominal = pd.get_dummies(X[nominal_attrs], dtype='uint8')
# Sanity check: every feature block and the target must have the same row count.
for part in [X_binary, X_income, X_edu, X_age, X_time, X_expiration,
             X[frequency_attrs], X_toCoupon, X_nominal, Y]:
    print(part.shape)

(12684, 3)
(12684,)
(12684,)
(12684,)
(12684,)
(12684,)
(12684, 5)
(12684,)
(12684, 50)
(12684,)


# Merging All the Attributes

In [15]:
# Assemble the final table column-wise: binary, ordinal, frequency, derived
# toCoupon and one-hot nominal features, with the target Y as the last column.
# NOTE(review): 'temperature' is listed in ordinal_attrs but is not part of
# this list, so it is absent from the merged frame -- confirm this is intended.
feature_blocks = [
    X_binary,
    X_income,
    X_edu,
    X_age,
    X_time,
    X_expiration,
    X[frequency_attrs],
    X_toCoupon,
    X_nominal,
    Y,
]
onehot_df = pd.concat(feature_blocks, axis=1)
print(onehot_df.shape)

(12684, 65)


# Saving the Preprocessed File

In [16]:
# Number of identical rows after encoding. Kept in a variable for inspection;
# the original computed this and discarded it.
n_duplicates = onehot_df.duplicated().sum()
# Reassign instead of mutating with inplace=True, keeping the first occurrence
# of each duplicated row.
onehot_df = onehot_df.drop_duplicates(keep='first')
# Write the processed table without the index column.
onehot_df.to_csv('preprocessed.csv', index=False)