In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import statsmodels.api       as sm

PATH     = "../datasets/"
CSV_DATA = "winequality.csv"

dataset  = pd.read_csv(PATH + CSV_DATA,
                       skiprows=1,       # Don't include header row as part of data.
                       encoding = "ISO-8859-1", sep=',',
                       names=('fixed acidity', 'volatile acidity', 'citric acid',
                              'residual sugar', 'chlorides', 'free sulfur dioxide',
                              'total sulfur dioxide', 'density', 'pH', 'sulphates',
                              'alcohol', 'quality'))
# Show all columns.
pd.set_option('display.max_columns', None)

# Increase number of columns that display on one line.
pd.set_option('display.width', 1000)
print(dataset.head())
print(dataset.describe())

# Include only statistically significant columns.
X = dataset[['volatile acidity',
             'chlorides', 'total sulfur dioxide',
             'pH', 'sulphates','alcohol']]
y = dataset['quality']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Stochastic gradient descent models are sensitive to differences
# in scale so a MinMax is usually used.
from sklearn.preprocessing import MinMaxScaler
scalerX = MinMaxScaler()
scalerX.fit(X_train)

# Build scaler for y.
scalerY = MinMaxScaler()
reshapedYtrain = np.array(y_train).reshape(-1,1)
scalerY.fit(reshapedYtrain)

# Scale X_train, X_test and y_train.
X_trainScaled = scalerX.transform(X_train)
X_testScaled  = scalerX.transform(X_test)
y_trainScaled = scalerY.transform(reshapedYtrain)

# Add constant to scaled data.
X_trainScaled = sm.add_constant(X_trainScaled)
X_testScaled  = sm.add_constant(X_testScaled)

#---------------------------------------------------------------
# Perform OLS regression.
model       = sm.OLS(y_trainScaled, X_trainScaled).fit()
predictions = model.predict(X_testScaled) # make the predictions by the model
print(model.summary())

# Convert predictions to unscaled predictions and compare with y_test.
unscaledPredictionsOLS = scalerY.inverse_transform(predictions.reshape(-1,1))
print('Root Mean Squared Error:',
      np.sqrt(mean_squared_error(y_test, unscaledPredictionsOLS)))

#---------------------------------------------------------------
# Perform Ridge regression.
print("\nRidge Regression")
from sklearn.linear_model import  Ridge
ridge_reg   = Ridge(solver='auto')
ridge_reg.fit(X_trainScaled, y_trainScaled)
predictions = ridge_reg.predict(X_testScaled)

# Convert predictions to unscaled predictions and compare with y_test.
unscaledPredictionsRidge = scalerY.inverse_transform(predictions.reshape(-1,1))
print('Root Mean Squared Error:',
      np.sqrt(mean_squared_error(y_test, unscaledPredictionsRidge)))

   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  alcohol  quality
0            7.4              0.70         0.00             1.9      0.076                 11.0                  34.0   0.9978  3.51       0.56      9.4        5
1            7.8              0.88         0.00             2.6      0.098                 25.0                  67.0   0.9968  3.20       0.68      9.8        5
2            7.8              0.76         0.04             2.3      0.092                 15.0                  54.0   0.9970  3.26       0.65      9.8        5
3           11.2              0.28         0.56             1.9      0.075                 17.0                  60.0   0.9980  3.16       0.58      9.8        6
4            7.4              0.70         0.00             1.9      0.076                 11.0                  34.0   0.9978  3.51       0.56      9.4        5
       fixed acidity  volati

In [4]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
import statsmodels.api       as sm

PATH = "../datasets/"
CSV_DATA = "USA_Housing.csv"

df  = pd.read_csv(PATH + CSV_DATA)
# Show all columns.
pd.set_option('display.max_columns', None)

# Increase number of columns that display on one line.
pd.set_option('display.width', 1000)

print(df.head())
print(df.describe())
X = df[['Avg. Area Income', 'Avg. Area House Age',  'Avg. Area Number of Rooms', 'Area Population']]

# Adding an intercept *** This is required ***. Don't forget this step.
# The intercept centers the error residuals around zero
# which helps to avoid over-fitting.
X = sm.add_constant(X)
y = df['Price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

from sklearn.preprocessing import MinMaxScaler
scalerX = MinMaxScaler()
scalerX.fit(X_train)

# Build scaler for y.
scalerY = MinMaxScaler()
reshapedYtrain = np.array(y_train).reshape(-1,1)
scalerY.fit(reshapedYtrain)

# Scale X_train, X_test and y_train.
X_trainScaled = scalerX.transform(X_train)
X_testScaled  = scalerX.transform(X_test)
y_trainScaled = scalerY.transform(reshapedYtrain)

# Add constant to scaled data.
X_trainScaled = sm.add_constant(X_trainScaled)
X_testScaled  = sm.add_constant(X_testScaled)

#---------------------------------------------------------------
# Perform OLS regression.
model       = sm.OLS(y_trainScaled, X_trainScaled).fit()
predictions = model.predict(X_testScaled) # make the predictions by the model
print(model.summary())

# Convert predictions to unscaled predictions and compare with y_test.
unscaledPredictionsOLS = scalerY.inverse_transform(predictions.reshape(-1,1))
print('Root Mean Squared Error:',
      np.sqrt(mean_squared_error(y_test, unscaledPredictionsOLS)))

#---------------------------------------------------------------
# Perform Ridge regression.
print("\nRidge Regression")
from sklearn.linear_model import  Ridge
ridge_reg   = Ridge(solver='auto')
ridge_reg.fit(X_trainScaled, y_trainScaled)
predictions = ridge_reg.predict(X_testScaled)

# Convert predictions to unscaled predictions and compare with y_test.
unscaledPredictionsRidge = scalerY.inverse_transform(predictions.reshape(-1,1))
print('Root Mean Squared Error:',
      np.sqrt(mean_squared_error(y_test, unscaledPredictionsRidge)))

   Avg. Area Income  Avg. Area House Age  Avg. Area Number of Rooms  Avg. Area Number of Bedrooms  Area Population         Price                                            Address
0      79545.458574             5.682861                   7.009188                          4.09     23086.800503  1.059034e+06  208 Michael Ferry Apt. 674\nLaurabury, NE 3701...
1      79248.642455             6.002900                   6.730821                          3.09     40173.072174  1.505891e+06  188 Johnson Views Suite 079\nLake Kathleen, CA...
2      61287.067179             5.865890                   8.512727                          5.13     36882.159400  1.058988e+06  9127 Elizabeth Stravenue\nDanieltown, WI 06482...
3      63345.240046             7.188236                   5.586729                          3.26     34310.242831  1.260617e+06                          USS Barnett\nFPO AP 44820
4      59982.197226             5.040555                   7.839388                          4.23   

  x = pd.concat(x[::order], 1)


In [5]:
import pandas  as pd
import numpy   as np
from   sklearn.model_selection import train_test_split
from   sklearn.linear_model    import LogisticRegression
from   sklearn                 import metrics

# Setup data.
candidates = {'gmat': [780,750,690,710,680,730,690,720,
                       740,690,610,690,710,680,770,610,580,650,540,590,620,
                       600,550,550,570,670,660,580,650,660,640,620,660,660,
                       680,650,670,580,590,690],
              'gpa': [4,3.9,3.3,3.7,3.9,3.7,2.3,3.3,
                      3.3,1.7,2.7,3.7,3.7,3.3,3.3,3,2.7,3.7,2.7,2.3,
                      3.3,2,2.3,2.7,3,3.3,3.7,2.3,3.7,3.3,3,2.7,4,
                      3.3,3.3,2.3,2.7,3.3,1.7,3.7],
              'work_experience': [3,4,3,5,4,6,1,4,5,
                                  1,3,5,6,4,3,1,4,6,2,3,2,1,4,1,2,6,4,2,6,5,1,2,4,6,
                                  5,1,2,1,4,5],
              'admitted': [1,1,1,1,1,1,0,1,1,0,0,1,
                           1,1,1,0,0,1,0,0,0,0,0,0,0,1,1,0,1,1,0,0,1,1,1,0,0,
                           0,0,1]}

df = pd.DataFrame(candidates,columns= ['gmat', 'gpa',
                                       'work_experience','admitted'])
print(df)

# Separate into x and y values.
X = df[['gmat', 'gpa','work_experience']]
y = df['admitted']

# Import the necessary libraries first
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Show chi-square scores for each feature.
# There is 1-degree freedom since 1 predictor during feature evaluation.
# Generally, >=3.8 is good)
test      = SelectKBest(score_func=chi2, k=3)
chiScores = test.fit(X, y) # Summarize scores
np.set_printoptions(precision=3)
print("\nPredictor Chi-Square Scores: " + str(chiScores.scores_))

# Re-assign X with significant columns only after chi-square test.
X = df[['gmat', 'work_experience']]

# Split data.
X_train,X_test,y_train,y_test = train_test_split(
    X, y, test_size=0.25,random_state=0)

# Perform logistic regression.
logisticModel = LogisticRegression(fit_intercept=True, random_state = 0,
                                   solver='liblinear')

# Stochastic gradient descent models are sensitive to differences
# in scale so a StandardScaler is usually used.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X_train)
X_trainScaled = scaler.transform(X_train)
X_testScaled  = scaler.transform(X_test)

print("\nRidge Classifier")
from sklearn.linear_model import RidgeClassifier
clf = RidgeClassifier(solver='auto')
clf.fit(X_trainScaled, y_train)

y_pred = clf.predict(X_testScaled)
print(y_pred)

# Show confusion matrix and accuracy scores.
confusion_matrix = pd.crosstab(y_test, y_pred,
                               rownames=['Actual'],
                               colnames=['Predicted'])

print('\nAccuracy: ',metrics.accuracy_score(y_test, y_pred))
print("\nConfusion Matrix")
print(confusion_matrix)

    gmat  gpa  work_experience  admitted
0    780  4.0                3         1
1    750  3.9                4         1
2    690  3.3                3         1
3    710  3.7                5         1
4    680  3.9                4         1
5    730  3.7                6         1
6    690  2.3                1         0
7    720  3.3                4         1
8    740  3.3                5         1
9    690  1.7                1         0
10   610  2.7                3         0
11   690  3.7                5         1
12   710  3.7                6         1
13   680  3.3                4         1
14   770  3.3                3         1
15   610  3.0                1         0
16   580  2.7                4         0
17   650  3.7                6         1
18   540  2.7                2         0
19   590  2.3                3         0
20   620  3.3                2         0
21   600  2.0                1         0
22   550  2.3                4         0
23   550  2.7   

In [9]:
import pandas  as pd
import numpy   as np
from   sklearn.model_selection import train_test_split
from   sklearn.linear_model    import LogisticRegression
from   sklearn                 import metrics
import statsmodels.api        as sm

PATH = "../datasets/"
FILE  = 'heart_disease.csv'
df = pd.read_csv(PATH + FILE)
print(df)

# Separate into x and y values.
X = df[['age',
        'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang',
        'oldpeak', 'slope', 'ca', 'thal']]
y = df['target']

# Import the necessary libraries first
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Show chi-square scores for each feature.
# There is 1-degree freedom since 1 predictor during feature evaluation.
# Generally, >=3.8 is good)
test      = SelectKBest(score_func=chi2, k=3)
chiScores = test.fit(X, y) # Summarize scores
np.set_printoptions(precision=3)
print("\nPredictor Chi-Square Scores: " + str(chiScores.scores_))

# Re-assign X with significant columns only after chi-square test.
X = df[['age',
        'sex', 'cp', 'trestbps', 'chol', 'fbs',  'thalach', 'exang',
        'oldpeak', 'slope', 'ca', 'thal']]

# Split data.
X_train,X_test,y_train,y_test = train_test_split(
    X, y, test_size=0.25,random_state=0)

# Stochastic gradient descent models are sensitive to differences
# in scale so a StandardScaler is usually used.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X_train)
X_trainScaled = scaler.transform(X_train)
X_testScaled  = scaler.transform(X_test)
X_trainScaled = sm.add_constant(X_trainScaled)
X_testScaled  = sm.add_constant(X_testScaled)

print("\nLogistic  Regression")
# Perform logistic regression.
logisticModel = LogisticRegression(fit_intercept=True, random_state = 0,
                                   solver='liblinear')

logisticModel.fit(X_trainScaled, y_train)
y_pred = logisticModel.predict(X_testScaled)

# Show confusion matrix and accuracy scores.
confusion_matrix = pd.crosstab(y_test, y_pred,
                               rownames=['Actual'],
                               colnames=['Predicted'])
print('\nAccuracy: ', metrics.accuracy_score(y_test, y_pred))
print("\nConfusion Matrix")
print(confusion_matrix)

print("\nRidge Classifier")
from sklearn.linear_model import RidgeClassifier
clf = RidgeClassifier(solver='auto')
clf.fit(X_trainScaled, y_train)

y_pred = clf.predict(X_testScaled)

# Show confusion matrix and accuracy scores.
confusion_matrix = pd.crosstab(y_test, y_pred,
                               rownames=['Actual'],
                               colnames=['Predicted'])

print('\nAccuracy: ', metrics.accuracy_score(y_test, y_pred))
print("\nConfusion Matrix")
print(confusion_matrix)

# Pickling
import pickle
# save model to file
pickle.dump(clf, open("myRidgeModel.dat", "wb"))

# load model from file
loaded_model = pickle.load(open("myRidgeModel.dat", "rb"))
print("----- Using Pickle File -----")
ridgePredictions = loaded_model.predict(X_testScaled)
print(ridgePredictions)

     age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  ca  thal  target
0     63    1   3       145   233    1        0      150      0      2.3      0   0     1       1
1     37    1   2       130   250    0        1      187      0      3.5      0   0     2       1
2     41    0   1       130   204    0        0      172      0      1.4      2   0     2       1
3     56    1   1       120   236    0        1      178      0      0.8      2   0     2       1
4     57    0   0       120   354    0        1      163      1      0.6      2   0     2       1
..   ...  ...  ..       ...   ...  ...      ...      ...    ...      ...    ...  ..   ...     ...
293   57    0   0       140   241    0        1      123      1      0.2      1   0     3       0
294   45    1   3       110   264    0        1      132      0      1.2      1   0     3       0
295   68    1   0       144   193    1        1      141      0      3.4      1   2     3       0
296   57    1   0   