In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from scipy import stats
from scipy.stats import randint

# prep
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.datasets import make_classification
from sklearn.preprocessing import binarize, LabelEncoder, MinMaxScaler
from sklearn.preprocessing import StandardScaler


# models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn import linear_model

# Validation libraries
from sklearn import metrics
from sklearn.metrics import accuracy_score, mean_squared_error, precision_recall_curve, classification_report
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import GridSearchCV

#Naive bayes
from sklearn.naive_bayes import GaussianNB

In [5]:
# Load the student spending data from a CSV in the working directory and preview it.
df = pd.read_csv("student_spending.csv")
df

Unnamed: 0.1,Unnamed: 0,age,gender,year_in_school,major,monthly_income,financial_aid,tuition,housing,food,transportation,books_supplies,entertainment,personal_care,technology,health_wellness,miscellaneous,preferred_payment_method
0,0,19,Non-binary,Freshman,Psychology,958,270,5939,709,296,123,188,41,78,134,127,72,Credit/Debit Card
1,1,24,Female,Junior,Economics,1006,875,4908,557,365,85,252,74,92,226,129,68,Credit/Debit Card
2,2,24,Non-binary,Junior,Economics,734,928,3051,666,220,137,99,130,23,239,112,133,Cash
3,3,23,Female,Senior,Computer Science,617,265,4935,652,289,114,223,99,30,163,105,55,Mobile Payment App
4,4,20,Female,Senior,Computer Science,810,522,3887,825,372,168,194,48,71,88,71,104,Credit/Debit Card
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,22,Female,Senior,Biology,1346,520,3688,969,152,194,151,42,38,252,65,163,Cash
996,996,19,Female,Senior,Biology,1407,560,3380,508,265,52,206,40,98,274,84,135,Mobile Payment App
997,997,20,Male,Junior,Economics,957,393,3497,723,339,139,69,112,46,284,57,28,Mobile Payment App
998,998,22,Non-binary,Senior,Economics,1174,612,3649,543,237,123,200,129,90,190,101,65,Mobile Payment App


In [6]:
# Drop the "Unnamed: 0" column, which just repeats the row number in the preview
# above. errors="ignore" keeps this cell idempotent: re-running it after the
# column is already gone no longer raises KeyError.
df = df.drop(columns="Unnamed: 0", errors="ignore")
df

Unnamed: 0,age,gender,year_in_school,major,monthly_income,financial_aid,tuition,housing,food,transportation,books_supplies,entertainment,personal_care,technology,health_wellness,miscellaneous,preferred_payment_method
0,19,Non-binary,Freshman,Psychology,958,270,5939,709,296,123,188,41,78,134,127,72,Credit/Debit Card
1,24,Female,Junior,Economics,1006,875,4908,557,365,85,252,74,92,226,129,68,Credit/Debit Card
2,24,Non-binary,Junior,Economics,734,928,3051,666,220,137,99,130,23,239,112,133,Cash
3,23,Female,Senior,Computer Science,617,265,4935,652,289,114,223,99,30,163,105,55,Mobile Payment App
4,20,Female,Senior,Computer Science,810,522,3887,825,372,168,194,48,71,88,71,104,Credit/Debit Card
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,22,Female,Senior,Biology,1346,520,3688,969,152,194,151,42,38,252,65,163,Cash
996,19,Female,Senior,Biology,1407,560,3380,508,265,52,206,40,98,274,84,135,Mobile Payment App
997,20,Male,Junior,Economics,957,393,3497,723,339,139,69,112,46,284,57,28,Mobile Payment App
998,22,Non-binary,Senior,Economics,1174,612,3649,543,237,123,200,129,90,190,101,65,Mobile Payment App


In [17]:
# Split the frame into the prediction target and the feature matrix.
# The three categorical columns are removed along with the target, so X
# keeps only the remaining (numeric) columns.
target_col = 'preferred_payment_method'
categorical_cols = ['gender', 'year_in_school', 'major']

y_value = df[target_col]
X = df.drop(columns=[target_col] + categorical_cols)

In [18]:
# Preview the feature frame (target and categorical columns already dropped).
X

Unnamed: 0,age,monthly_income,financial_aid,tuition,housing,food,transportation,books_supplies,entertainment,personal_care,technology,health_wellness,miscellaneous
0,19,958,270,5939,709,296,123,188,41,78,134,127,72
1,24,1006,875,4908,557,365,85,252,74,92,226,129,68
2,24,734,928,3051,666,220,137,99,130,23,239,112,133
3,23,617,265,4935,652,289,114,223,99,30,163,105,55
4,20,810,522,3887,825,372,168,194,48,71,88,71,104
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,22,1346,520,3688,969,152,194,151,42,38,252,65,163
996,19,1407,560,3380,508,265,52,206,40,98,274,84,135
997,20,957,393,3497,723,339,139,69,112,46,284,57,28
998,22,1174,612,3649,543,237,123,200,129,90,190,101,65


In [19]:
# Preview the raw string-valued target labels.
y_value

0       Credit/Debit Card
1       Credit/Debit Card
2                    Cash
3      Mobile Payment App
4       Credit/Debit Card
              ...        
995                  Cash
996    Mobile Payment App
997    Mobile Payment App
998    Mobile Payment App
999                  Cash
Name: preferred_payment_method, Length: 1000, dtype: object

In [26]:
# List the distinct target classes.
y_value.unique()

array(['Credit/Debit Card', 'Cash', 'Mobile Payment App'], dtype=object)

In [15]:
# Column names of the feature frame.
X.columns

Index(['age', 'monthly_income', 'financial_aid', 'tuition', 'housing', 'food',
       'transportation', 'books_supplies', 'entertainment', 'personal_care',
       'technology', 'health_wellness', 'miscellaneous'],
      dtype='object')

In [21]:
# Convert the string payment-method labels into integer codes.
# In the output shown for this cell: 'Cash' -> 0, 'Credit/Debit Card' -> 1,
# 'Mobile Payment App' -> 2.
label_encoder = LabelEncoder().fit(y_value)
y = label_encoder.transform(y_value)
y

array([1, 1, 0, 2, 1, 2, 1, 2, 1, 2, 1, 1, 1, 2, 0, 0, 0, 2, 1, 2, 1, 2,
       0, 1, 2, 2, 1, 1, 1, 2, 2, 2, 1, 0, 2, 0, 0, 0, 0, 2, 2, 0, 0, 1,
       1, 1, 0, 2, 0, 2, 0, 1, 0, 2, 2, 0, 0, 2, 2, 2, 2, 2, 0, 2, 2, 2,
       2, 1, 2, 2, 2, 0, 0, 2, 1, 0, 0, 1, 0, 0, 2, 2, 0, 2, 2, 1, 1, 1,
       2, 1, 1, 0, 2, 2, 1, 1, 1, 1, 0, 2, 0, 2, 0, 0, 2, 0, 2, 2, 2, 1,
       2, 1, 1, 0, 2, 2, 0, 1, 1, 2, 2, 2, 0, 1, 2, 1, 0, 1, 1, 1, 1, 1,
       2, 2, 2, 2, 2, 0, 1, 0, 2, 0, 2, 0, 0, 0, 1, 1, 1, 2, 1, 0, 1, 2,
       0, 1, 1, 2, 1, 2, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 2, 2, 2, 1, 0,
       2, 1, 0, 0, 2, 2, 1, 2, 1, 2, 1, 0, 0, 1, 2, 1, 1, 1, 0, 0, 0, 1,
       1, 1, 0, 1, 1, 1, 0, 2, 2, 0, 1, 0, 1, 2, 1, 1, 0, 2, 1, 2, 1, 1,
       2, 2, 1, 0, 1, 1, 2, 2, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 2, 2, 2, 0,
       0, 0, 0, 2, 2, 2, 0, 2, 0, 2, 1, 2, 1, 1, 1, 1, 0, 1, 2, 2, 2, 2,
       1, 0, 2, 2, 2, 2, 1, 0, 0, 0, 2, 0, 1, 1, 2, 1, 2, 1, 1, 0, 0, 2,
       0, 2, 0, 0, 2, 2, 1, 1, 0, 2, 1, 1, 1, 2, 1,

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

In [42]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# NOTE(review): y holds label-encoded payment-method classes, so this fits a
# regression on nominal class codes -- confirm a classifier was not intended.
model = LinearRegression().fit(X_train, y_train)

# Score the held-out predictions with mean squared error.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 0.6452850047253724


In [47]:


# k-nearest-neighbours regressor using 3 neighbours, fitted on the existing
# train split (X_train / y_train from the earlier train_test_split cell).
reg = KNeighborsRegressor(n_neighbors=3).fit(X_train, y_train)

# Report the regressor's score (R^2) on both splits.
train_r2 = reg.score(X_train, y_train)
test_r2 = reg.score(X_test, y_test)
print("Training set R^2: {:.2f}".format(train_r2))
print("Test set R^2: {:.2f}".format(test_r2))

Training set R^2: 0.37
Test set R^2: -0.45


In [56]:

# Linear regression model, created through the linear_model namespace and
# fitted on the training split.
slr = linear_model.LinearRegression()
slr.fit(X_train, y_train)

# Show the learned parameters: the per-feature coefficients and the intercept.
for label, value in (('beta', slr.coef_), ('alpha', slr.intercept_)):
    print(label, value)

beta [-9.80013611e-03  4.70558331e-05  1.41871596e-04 -4.60434935e-05
  1.47782586e-04 -1.85997164e-04 -1.29622112e-03 -4.23754737e-04
 -1.36977738e-03 -5.11719596e-04 -1.72756399e-04 -7.60128947e-04
  8.30551429e-04]
alpha 1.6962058557945263


In [57]:


# Evaluate the fitted linear model on the held-out split.
y_predict = slr.predict(X_test)

# Store the results under their own names. Assigning them to
# `mean_squared_error` / `r2_score` would rebind the imported metric functions
# to floats, so re-running this cell (or any other scoring cell) would raise
# TypeError: 'float' object is not callable.
mse = mean_squared_error(y_test, y_predict)
r2 = r2_score(y_test, y_predict)

print('mean square error:', mse)
print('r square:', r2)


mean square error: 0.6452850047253724
r square: -0.026869835654634855


In [84]:
# Hyper-parameter search over a random forest: 3 tree counts x 2 depths, 3-fold CV.
# The target has three classes, so the binary-only 'roc_auc' scorer cannot score
# it; 'roc_auc_ovr' computes a one-vs-rest AUC from the class probabilities.
# random_state pins the forest so the search gives the same result on re-runs
# (42 matches the seed used for the train/test split).
grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid={
        'n_estimators': [100, 200, 500],
        'max_depth': [3, 8],
    },
    cv=3,
    scoring='roc_auc_ovr',
    n_jobs=-1,
)

In [85]:
# Build and fit the final pipeline: scale the features, then run the grid search
# defined in the previous cell on the scaled training data.
from sklearn.pipeline import make_pipeline, FeatureUnion, Pipeline
from sklearn.compose import make_column_transformer
from sklearn import set_config
# Ask scikit-learn to render estimators as diagrams in the notebook.
set_config(display='diagram')


# Same arguments as the earlier split (test_size=0.2, random_state=42).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# NOTE(review): none of these names appear in X.columns, and the list is not
# used when building the pipeline below -- looks like a leftover from another
# dataset; confirm before relying on it.
keep_features = ['radius_mean', 'texture_mean', 'smoothness_mean', 'compactness_mean']

# Feature step: a FeatureUnion holding a single sub-pipeline ('xp') that
# standardises every column with StandardScaler.
X_proc = make_pipeline(StandardScaler())
pipeline_features = ('features', FeatureUnion([('xp', X_proc)]))

# Full pipeline: feature step followed by the GridSearchCV object `grid`;
# fitting the pipeline therefore runs the whole grid search.
pipe = Pipeline([pipeline_features, ("estimator",grid)])
pipe.fit(X_train, y_train)



In [90]:
# NOTE(review): these names do not match any column in X.columns -- presumably
# left over from a different dataset; confirm whether this list is still needed.
keep_features

['radius_mean', 'texture_mean', 'smoothness_mean', 'compactness_mean']

In [89]:
# Peek at the first two rows of the frame.
df.head(2)


Unnamed: 0,age,gender,year_in_school,major,monthly_income,financial_aid,tuition,housing,food,transportation,books_supplies,entertainment,personal_care,technology,health_wellness,miscellaneous,preferred_payment_method
0,19,Non-binary,Freshman,Psychology,958,270,5939,709,296,123,188,41,78,134,127,72,Credit/Debit Card
1,24,Female,Junior,Economics,1006,875,4908,557,365,85,252,74,92,226,129,68,Credit/Debit Card


In [91]:
import pickle

# Persist the fitted pipeline to disk. The context manager closes the file
# handle even if pickling fails, so the written bytes are flushed and no
# descriptor is leaked.
with open('pipe.pkl', 'wb') as pkl_file:
    pickle.dump(pipe, pkl_file)