In [2]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
import statsmodels.api       as sm

# Location of the wine-quality CSV, relative to this notebook.
PATH = "../datasets/"
CSV_DATA = "winequality.csv"

# Display options: never hide columns, and widen the console so that the
# printed frames are not wrapped across several lines.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

# Load the file. Its own header row is skipped and the column names are
# supplied explicitly instead.
dataset = pd.read_csv(
    PATH + CSV_DATA,
    sep=',',
    encoding="ISO-8859-1",
    skiprows=1,
    names=(
        'fixed acidity', 'volatile acidity', 'citric acid',
        'residual sugar', 'chlorides', 'free sulfur dioxide',
        'total sulfur dioxide', 'density', 'pH', 'sulphates',
        'alcohol', 'quality',
    ),
)

# Preview the first rows, followed by per-column summary statistics.
print(dataset.head())
print(dataset.describe())
# Target column and the subset of predictors used by the regression.
y = dataset['quality']
X = dataset[['volatile acidity', 'chlorides', 'total sulfur dioxide',
             'sulphates', 'alcohol']]

# statsmodels' OLS does not add an intercept on its own, so a constant
# column must be appended explicitly. Without it the fitted line would be
# forced through the origin.
X = sm.add_constant(X)

# Hold out 20% of the rows for evaluation; the seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)

# Fit ordinary least squares on the training rows and score the held-out rows.
model = sm.OLS(y_train, X_train).fit()
predictions = model.predict(X_test)
print(model.summary())
print('Root Mean Squared Error:',
      np.sqrt(metrics.mean_squared_error(y_test, predictions)))

###########################################################
print("\nStochastic Gradient Descent")
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error

# Stochastic gradient descent models are sensitive to differences
# in scale so a StandardScaler is usually used.
from sklearn.preprocessing import StandardScaler

# X_train / X_test still carry the 'const' column that was added for
# statsmodels. SGDRegressor fits its own intercept, and StandardScaler would
# turn a constant column into all zeros, so the column is dropped here.
# errors='ignore' keeps this cell working if the column is absent.
sgd_train = X_train.drop(columns='const', errors='ignore')
sgd_test  = X_test.drop(columns='const', errors='ignore')

scaler = StandardScaler()
scaler.fit(sgd_train)  # Don't cheat - fit only on training data
X_trainScaled = scaler.transform(sgd_train)
X_testScaled  = scaler.transform(sgd_test)

# scikit-learn SGD regressor. The training data is shuffled between epochs,
# so the seed is fixed (matching the split above) to make the reported RMSE
# reproducible from run to run.
sgd = SGDRegressor(verbose=1, random_state=0)
sgd.fit(X_trainScaled, y_train)
predictions = sgd.predict(X_testScaled)
print('Root Mean Squared Error:',
      np.sqrt(mean_squared_error(y_test, predictions)))

   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  alcohol  quality
0            7.4              0.70         0.00             1.9      0.076                 11.0                  34.0   0.9978  3.51       0.56      9.4        5
1            7.8              0.88         0.00             2.6      0.098                 25.0                  67.0   0.9968  3.20       0.68      9.8        5
2            7.8              0.76         0.04             2.3      0.092                 15.0                  54.0   0.9970  3.26       0.65      9.8        5
3           11.2              0.28         0.56             1.9      0.075                 17.0                  60.0   0.9980  3.16       0.58      9.8        6
4            7.4              0.70         0.00             1.9      0.076                 11.0                  34.0   0.9978  3.51       0.56      9.4        5
       fixed acidity  volati

  x = pd.concat(x[::order], 1)


In [4]:
import pandas  as pd
import numpy   as np
from   sklearn.model_selection import train_test_split
from   sklearn.linear_model    import LogisticRegression
from   sklearn                 import metrics

# Raw applicant records, one list per attribute (40 applicants each).
candidates = dict(
    gmat=[
        780, 750, 690, 710, 680, 730, 690, 720, 740, 690,
        610, 690, 710, 680, 770, 610, 580, 650, 540, 590,
        620, 600, 550, 550, 570, 670, 660, 580, 650, 660,
        640, 620, 660, 660, 680, 650, 670, 580, 590, 690,
    ],
    gpa=[
        4, 3.9, 3.3, 3.7, 3.9, 3.7, 2.3, 3.3, 3.3, 1.7,
        2.7, 3.7, 3.7, 3.3, 3.3, 3, 2.7, 3.7, 2.7, 2.3,
        3.3, 2, 2.3, 2.7, 3, 3.3, 3.7, 2.3, 3.7, 3.3,
        3, 2.7, 4, 3.3, 3.3, 2.3, 2.7, 3.3, 1.7, 3.7,
    ],
    work_experience=[
        3, 4, 3, 5, 4, 6, 1, 4, 5, 1,
        3, 5, 6, 4, 3, 1, 4, 6, 2, 3,
        2, 1, 4, 1, 2, 6, 4, 2, 6, 5,
        1, 2, 4, 6, 5, 1, 2, 1, 4, 5,
    ],
    admitted=[
        1, 1, 1, 1, 1, 1, 0, 1, 1, 0,
        0, 1, 1, 1, 1, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 1, 1, 0, 1, 1,
        0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
    ],
)

# Build the frame with an explicit column order and show it.
df = pd.DataFrame(
    data=candidates,
    columns=['gmat', 'gpa', 'work_experience', 'admitted'],
)
print(df)

# Feature-selection helpers.
from sklearn.feature_selection import SelectKBest, chi2

# Target first, then the full set of candidate predictors.
y = df['admitted']
X = df[['gmat', 'gpa', 'work_experience']]

# Compute a chi-square score for every predictor against the target.
# Each predictor is evaluated on its own (one degree of freedom), so a
# score of about 3.8 or more is generally regarded as good.
test      = SelectKBest(score_func=chi2, k=3)
chiScores = test.fit(X, y)
np.set_printoptions(precision=3)
print(f"\nPredictor Chi-Square Scores: {chiScores.scores_}")

# From here on keep only the predictors retained after the chi-square test.
X = df[['gmat', 'work_experience']]

from sklearn.preprocessing import StandardScaler

# Hold out a quarter of the rows for testing; the seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0)

# Standardise the predictors. The scaler learns its statistics from the
# training rows only and is then applied to both partitions; the scaled
# arrays are reused by the models that follow.
scaler = StandardScaler().fit(X_train)
X_trainScaled = scaler.transform(X_train)
X_testScaled  = scaler.transform(X_test)

# Fit the logistic regression and predict the held-out rows.
logisticModel = LogisticRegression(solver='liblinear',
                                   fit_intercept=True,
                                   random_state=0)
logisticModel.fit(X_trainScaled, y_train)
y_pred = logisticModel.predict(X_testScaled)

# Report the fitted parameters: intercept first, then the coefficients.
print("\nModel Coefficients: ")
print("\nIntercept: ")
print(logisticModel.intercept_)

print(logisticModel.coef_)

# Cross-tabulate actual against predicted labels and report accuracy.
confusion_matrix = pd.crosstab(
    y_test, y_pred, rownames=['Actual'], colnames=['Predicted'])

print('\nAccuracy: ', metrics.accuracy_score(y_test, y_pred))
print("\nConfusion Matrix")
print(confusion_matrix)

print("\nStochastic Gradient Descent")
from sklearn.linear_model import SGDClassifier

# SGDClassifier shuffles the training data between epochs. With only a
# handful of test rows the accuracy and confusion matrix can change from run
# to run, so the seed is fixed, matching the random_state=0 used for the
# split and for the logistic regression above.
clf = SGDClassifier(random_state=0)
clf.fit(X_trainScaled, y_train)

# Predict the held-out rows from the already-scaled test features.
y_pred = clf.predict(X_testScaled)

# Show confusion matrix and accuracy scores.
confusion_matrix = pd.crosstab(y_test, y_pred,
                               rownames=['Actual'],
                               colnames=['Predicted'])

print('\nAccuracy: ',metrics.accuracy_score(y_test, y_pred))
print("\nConfusion Matrix")
print(confusion_matrix)

    gmat  gpa  work_experience  admitted
0    780  4.0                3         1
1    750  3.9                4         1
2    690  3.3                3         1
3    710  3.7                5         1
4    680  3.9                4         1
5    730  3.7                6         1
6    690  2.3                1         0
7    720  3.3                4         1
8    740  3.3                5         1
9    690  1.7                1         0
10   610  2.7                3         0
11   690  3.7                5         1
12   710  3.7                6         1
13   680  3.3                4         1
14   770  3.3                3         1
15   610  3.0                1         0
16   580  2.7                4         0
17   650  3.7                6         1
18   540  2.7                2         0
19   590  2.3                3         0
20   620  3.3                2         0
21   600  2.0                1         0
22   550  2.3                4         0
23   550  2.7   

In [6]:
# Sample data: paired weight / height observations.
weights = [0.5, 2.3, 2.9]
heights = [1.4, 1.9, 3.2]

def getRes(weights, heights, intercept, beta=0.64):
    """Print the slope of the squared-error curve at a given intercept.

    For the line ``height = intercept + beta * weight`` this evaluates
    ``sum(-2 * (height - intercept - beta * weight))`` over all
    observations, i.e. the derivative of the sum of squared residuals with
    respect to the intercept. The value is rounded to two decimals and
    printed.

    Parameters:
        weights:   sequence of x values.
        heights:   sequence of y values, same length as ``weights``.
        intercept: intercept at which the derivative is evaluated.
        beta:      slope of the line (defaults to 0.64).

    Returns:
        None. The result is only printed.

    Raises:
        ValueError: if ``weights`` and ``heights`` differ in length.
    """
    if len(weights) != len(heights):
        raise ValueError("weights and heights must have the same length")

    # Builtin sum over paired observations; the accumulator no longer
    # shadows the builtin name.
    gradient = sum(-2 * (height - intercept - beta * weight)
                   for weight, height in zip(weights, heights))

    # Adding 0.0 turns a rounded negative zero into 0.0, so a tiny negative
    # value is not printed as "-0.0".
    rounded = round(gradient, 2) + 0.0

    print("Intercept: " + str(intercept) + " Res: " + str(rounded))

# Evaluate the derivative at a trial intercept.
intercept = 0.95
getRes(weights, heights, intercept)

Intercept: 0.95 Res: -0.0
