# หนังสือ Data Science วิเคราะห์การตลาด ด้วย Python
---
### บทที่ 7 เทคนิคการถดถอย และค้นหาฟีเจอร์ที่สำคัญ

#### ตัวอย่าง 7.1

In [1]:
# Example 7.1
# Fit a linear regression that predicts location revenue, report its test
# error, then refit without 'num_competitors' to see how the error changes.
import pandas as pd
df = pd.read_csv('data/location_rev.csv')
df.head()

from sklearn.model_selection import train_test_split
X = df[['num_competitors','median_income','num_loyalty_members',
        'population_density','location_age']]
y = df['revenue']
# A fixed random_state is passed so the split can be repeated.
X_train,X_test,y_train,y_test = train_test_split(X, y,
                                                 random_state=1)

from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train,y_train)

predictions = model.predict(X_test)

from sklearn.metrics import mean_squared_error, mean_absolute_error
# scikit-learn metrics are declared as metric(y_true, y_pred); pass the
# observed values first. RMSE is the square root of the MSE.
print('RMSE: ' + str(mean_squared_error(y_test, predictions)**0.5))
print('MAE: ' + str(mean_absolute_error(y_test, predictions)))

# Second run: same estimator object, refit on the data without
# 'num_competitors'. From here on `model` holds the reduced fit.
X_train2 = X_train.drop('num_competitors', axis=1)
X_test2 = X_test.drop('num_competitors', axis=1)
model.fit(X_train2, y_train)
predictions2 = model.predict(X_test2)

# Errors of the reduced model (printed with the same labels as above).
print('RMSE: ' + str(mean_squared_error(y_test, predictions2)**0.5))
print('MAE: ' + str(mean_absolute_error(y_test, predictions2)))

Unnamed: 0,revenue,num_competitors,median_income,num_loyalty_members,population_density,location_age
0,42247.8,3.0,30527.57,1407.0,3302.0,12.0
1,38628.37,3.0,30185.49,1025.0,4422.0,11.0
2,39715.16,1.0,32182.24,1498.0,3260.0,12.0
3,35593.3,5.0,29728.65,2340.0,4325.0,10.0
4,35128.18,4.0,30691.17,847.0,3774.0,11.0


#### กิจกรรม 7.1

In [2]:
# Activity 7.1
# Measure how much each offer attribute matters by comparing the test RMSE
# of a model trained on all three features with models trained after
# dropping one feature at a time.
import pandas as pd
df = pd.read_csv('data/offer_responses.csv')
df.head()

X = df[['offer_quality','offer_discount','offer_reach']]
y = df['responses']

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X, y,
                                                 random_state=1)

# all variables
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
model = LinearRegression()
model.fit(X_train,y_train)
predictions1 = model.predict(X_test)
# mean_squared_error is declared as (y_true, y_pred): observed values first.
print('RMSE with all variables: '
      + str(mean_squared_error(y_test, predictions1)**0.5))

# without offer quality
X_train2 = X_train.drop('offer_quality',axis=1)
X_test2 = X_test.drop('offer_quality',axis=1)
model = LinearRegression()
model.fit(X_train2,y_train)
predictions2 = model.predict(X_test2)
print('RMSE without offer quality: '
      + str(mean_squared_error(y_test, predictions2)**0.5))

# without offer discount
X_train3 = X_train.drop('offer_discount',axis=1)
X_test3 = X_test.drop('offer_discount',axis=1)
model = LinearRegression()
model.fit(X_train3,y_train)
predictions = model.predict(X_test3)
print('RMSE without offer discount: '
      + str(mean_squared_error(y_test, predictions)**0.5))

# without offer reach
X_train4 = X_train.drop('offer_reach',axis=1)
X_test4 = X_test.drop('offer_reach',axis=1)
model = LinearRegression()
model.fit(X_train4,y_train)
predictions = model.predict(X_test4)
print('RMSE without offer reach: '
      + str(mean_squared_error(y_test, predictions)**0.5))

RMSE with all variables: 1034.8208821019043
RMSE without offer quality: 1033.698152671967
RMSE without offer discount: 1228.103284778811
RMSE without offer reach: 1199.7110950944152


#### ตัวอย่าง 7.2

In [None]:
# Example 7.2
# Use recursive feature elimination (RFE) to keep 5 features, then compare
# the test RMSE of the full linear model with the reduced one.
import pandas as pd
df = pd.read_csv('data/20scores.csv')
df.head()

# Every column after the first is used as a feature.
# NOTE(review): assumes 'revenue' is the first column, otherwise the target
# would also be part of X -- confirm against the CSV.
x_cols = df.columns[1:]
X = df[x_cols]
y = df['revenue']

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X, y,
                                                 random_state=1)

from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train,y_train)

model.coef_

from sklearn.feature_selection import RFE
rfe = RFE(estimator=LinearRegression(), n_features_to_select=5)
rfe.fit(X_train,y_train)

# support_ is a boolean mask and ranking_ the rank, both aligned with the
# columns of X_train; list only the features RFE kept.
for feature_name, is_selected, rank in zip(X_train.columns,
                                           rfe.support_,
                                           rfe.ranking_):
    if is_selected:
        print("Feature: {}, Rank: {}".format(feature_name, rank))

# Restrict both splits to the selected columns.
selected_columns = X_train.columns[rfe.support_]
X_train_reduced = X_train[selected_columns]
X_test_reduced = X_test[selected_columns]

rfe_model = LinearRegression()
rfe_model.fit(X_train_reduced,y_train)

from sklearn.metrics import mean_squared_error
# First RMSE: model trained on all features.
# mean_squared_error is declared as (y_true, y_pred): observed values first.
predictions = model.predict(X_test)
print('RMSE =',mean_squared_error(y_test, predictions)**0.5)

# Second RMSE: model trained on the RFE-selected features only.
rfe_predictions = rfe_model.predict(X_test_reduced)
print('RMSE =',mean_squared_error(y_test, rfe_predictions)**0.5)

#### กิจกรรม 7.2

In [1]:
# Activity 7.2
# Select the 3 most useful features for predicting current-year spend with
# RFE, fit a linear model on them and print its test RMSE.
import pandas as pd
df = pd.read_csv('data/customer_spend.csv')
df.head()

# Every column after the first is used as a feature.
# NOTE(review): assumes 'cur_year_spend' is the first column, otherwise the
# target would also be part of X -- confirm against the CSV.
cols = df.columns[1:]
X = df[cols]
y = df['cur_year_spend']

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X, y,
                                               random_state=1)

from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
rfe = RFE(estimator=LinearRegression(),n_features_to_select=3)
rfe.fit(X_train,y_train)

# support_ is a boolean mask and ranking_ the rank, both aligned with the
# columns of X_train; list only the features RFE kept.
for feature_name, is_selected, rank in zip(X_train.columns,
                                           rfe.support_,
                                           rfe.ranking_):
    if is_selected:
        print("Feature: {}, Rank: {}".format(feature_name, rank))

# Restrict both splits to the selected columns.
selected_columns = X_train.columns[rfe.support_]
X_train_reduced = X_train[selected_columns]
X_test_reduced = X_test[selected_columns]

rfe_model = LinearRegression()
rfe_model.fit(X_train_reduced,y_train)

from sklearn.metrics import mean_squared_error
rfe_predictions = rfe_model.predict(X_test_reduced)
# Test RMSE of the reduced model; mean_squared_error is declared as
# (y_true, y_pred), so the observed values go first.
print(mean_squared_error(y_test, rfe_predictions)**0.5)

Feature: days_since_first_purchase, Rank: 1
Feature: total_transactions, Rank: 1
Feature: engagement_score, Rank: 1
1002.3365378897391


#### ตัวอย่าง 7.3

In [None]:
# ตัวอย่าง 7.3
import pandas as pd
df = pd.read_csv('data/age_spend.csv')
df.head(2)

X = df[['age']]
y = df['spend']

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X, y,
                                                 random_state=1)

# decision tree
from sklearn.tree import DecisionTreeRegressor
max2_tree_model = DecisionTreeRegressor(max_depth=2)
max2_tree_model.fit(X_train,y_train)
max5_tree_model = DecisionTreeRegressor(max_depth=5)
max5_tree_model.fit(X_train,y_train)

# linear regression
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train,y_train)

    # RMSE
from sklearn.metrics import mean_squared_error
linear_predictions = model.predict(X_test)
print('Linear model RMSE: '+
      str(mean_squared_error(linear_predictions, y_test)**0.5))
max2_tree_predictions = max2_tree_model.predict(X_test)
print('Tree with max depth of 2 RMSE: '+
      str(mean_squared_error(max2_tree_predictions, y_test)**0.5))
max5_tree_predictions = max5_tree_model.predict(X_test)
print('tree with max depth of 5 RMSE: '+
      str(mean_squared_error(max5_tree_predictions, y_test)**0.5))

    # scatterplot
import matplotlib.pyplot as plt
%matplotlib inline
fig, (ax1,ax2,ax3) = plt.subplots(1, 3, figsize=(18,5))
ages = pd.DataFrame({'age':range(18,70)})

ax1.scatter(X_test.age.tolist(), y_test.tolist())
ax1.plot(ages,model.predict(ages), color='r', 
         linewidth=5,label="Linear Regression")

ax2.scatter(X_test.age.tolist(), y_test.tolist())
ax2.plot(ages,max2_tree_model.predict(ages),color='g',
         linewidth=5,label="Tree with max depth 2")

ax3.scatter(X_test.age.tolist(), y_test.tolist())
ax3.plot(ages,max5_tree_model.predict(ages),color='k',
         linewidth=5, label="Tree with max depth 5")

plt.show();

# Random forest
from sklearn.ensemble import RandomForestRegressor
max2_forest_model = RandomForestRegressor(max_depth=2,
                                          random_state=1)
max2_forest_model.fit(X_train,y_train)
max5_forest_model = RandomForestRegressor(max_depth=5,
                                          random_state=1)
max5_forest_model.fit(X_train,y_train)

    # RMSE
max2_forest_predictions = max2_forest_model.predict(X_test)
print('Max depth of 2 RMSE: ' + 
      str(mean_squared_error(max2_forest_predictions, y_test)**0.5))
max5_forest_predictions = max5_forest_model.predict(X_test)
print('Max depth of 5 RMSE: ' + 
      str(mean_squared_error(max5_forest_predictions, y_test)**0.5))

    # scatterplot
fig, (ax1,ax2) = plt.subplots(1, 2, figsize=(18,5))

ax1.scatter(X_test.age.tolist(), y_test.tolist(),color='gray')
ax1.plot(ages,max2_forest_model.predict(ages), color='c',
         linewidth=5, label="Forest with max depth 2")

ax2.scatter(X_test.age.tolist(), y_test.tolist(),color='gray')
ax2.plot(ages,max5_forest_model.predict(ages), color='m',
         linewidth=5, label="Forest with max depth 5")

plt.show();

# พล็อตเส้นกราฟทั้งหมด
plt.figure(figsize=(12,8))
plt.scatter(X_test.age.tolist(), y_test.tolist())
plt.plot(ages,model.predict(ages), color='r', linewidth=5, 
         label="Linear Regression")
plt.plot(ages,max2_tree_model.predict(ages), color='g',
         linewidth=5,label="Tree with max depth 2")
plt.plot(ages,max5_tree_model.predict(ages), color='k',
         linewidth=5, label="Tree with max depth 5")
plt.plot(ages,max2_forest_model.predict(ages), color='c',
         linewidth=5, label="Forest with max depth 2")
plt.plot(ages,max5_forest_model.predict(ages), color='m',
         linewidth=5, label="Forest with max depth 5")
plt.legend()
plt.xlabel("age")
plt.ylabel("spend")
plt.show()

#### กิจกรรม 7.3

In [3]:
# Activity 7.3
# Predict spend from age, income and years of education with five models
# (linear, two decision trees, two random forests) and print each test RMSE.
import pandas as pd
df = pd.read_csv('data/spend_age_income_ed.csv')
df.head()

X = df[['age','income','years_of_education']]
y = df['spend']

# train-test split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X, y,
                                               random_state=1)

# models
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train,y_train)

from sklearn.tree import DecisionTreeRegressor
max2_tree_model = DecisionTreeRegressor(max_depth=2)
max2_tree_model.fit(X_train,y_train)
max5_tree_model = DecisionTreeRegressor(max_depth=5)
max5_tree_model.fit(X_train,y_train)

from sklearn.ensemble import RandomForestRegressor
max2_forest_model = RandomForestRegressor(max_depth=2,
                                          random_state=1)
max2_forest_model.fit(X_train,y_train)
max5_forest_model = RandomForestRegressor(max_depth=5,
                                          random_state=1)
max5_forest_model.fit(X_train,y_train)

# RMSE
# mean_squared_error is declared as (y_true, y_pred): observed values first.
from sklearn.metrics import mean_squared_error
linear_predictions = model.predict(X_test)
print('Linear model RMSE: ' +
      str(mean_squared_error(y_test,
                             linear_predictions)**0.5))
max2_tree_predictions = max2_tree_model.predict(X_test)
print('Tree with max depth of 2 RMSE: ' +
      str(mean_squared_error(y_test,
                             max2_tree_predictions)**0.5))
max5_tree_predictions = max5_tree_model.predict(X_test)
print('Tree with max depth of 5 RMSE: ' +
      str(mean_squared_error(y_test,
                             max5_tree_predictions)**0.5))
max2_forest_predictions = max2_forest_model.predict(X_test)
print('Random Forest with max depth of 2 RMSE: ' +
      str(mean_squared_error(y_test,
                             max2_forest_predictions)**0.5))
max5_forest_predictions = max5_forest_model.predict(X_test)
print('Random Forest with max depth of 5 RMSE: ' +
      str(mean_squared_error(y_test,
                             max5_forest_predictions)**0.5))

Linear model RMSE: 353.258320016064
Tree with max depth of 2 RMSE: 269.4925790778443
Tree with max depth of 5 RMSE: 131.38090248043963
Random Forest with max depth of 2 RMSE: 267.815331131688
Random Forest with max depth of 5 RMSE: 122.76180483143743
