In [7]:
# Pandas is used for data manipulation
import pandas as pd
from   sklearn.metrics import mean_squared_error

# Read in data and display first 5 rows
features = pd.read_csv('../datasets/temperatures.csv')

# Show all columns.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

print(features)

# One-hot encode the data using pandas get_dummies
features = pd.get_dummies(features)

# Display the first 5 rows of the last 12 columns.
print(features.head(5))

# Use numpy to convert to arrays
import numpy as np

# Labels are the values we want to predict
labels = np.array(features['actual'])

# Remove the labels from the features
# axis 1 refers to the columns
features= features.drop('actual', axis = 1)

# Saving feature names for later use
feature_list = list(features.columns)

# Convert to numpy array
features = np.array(features)

# Using Skicit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.25, random_state = 42)

# Import the model we are using
from sklearn.ensemble import RandomForestRegressor

# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)

# Train the model on training data
rf.fit(train_features, train_labels)

# Use the forest's predict method on the test data
predictions = rf.predict(test_features)

# Calculate the absolute errors
errors = abs(predictions - test_labels)

# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')

# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / test_labels)

# Calculate and display accuracy
accuracy = 100 - np.mean(mape)

print('Accuracy:', round(accuracy, 2), '%.')

# Print out the mean square error.
mse = mean_squared_error(test_labels, predictions)
print('RMSE:', np.sqrt(mse))

# Get numerical feature importances
importances = list(rf.feature_importances_)
# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
# Print out the feature and importances
for pair in feature_importances:
    print('Variable: {:20} Importance: {}'.format(*pair))

# New random forest with only the two most important variables
rf_most_important = RandomForestRegressor(n_estimators= 1000, random_state=42)

# Extract the two most important features
important_indices = [feature_list.index('temp_1'), feature_list.index('average')]
train_important = train_features[:, important_indices]
test_important = test_features[:, important_indices]

# Train the random forest
rf_most_important.fit(train_important, train_labels)

# Make predictions and determine the error
predictions = rf_most_important.predict(test_important)
errors = abs(predictions - test_labels)

# Display the performance metrics
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')
mape = np.mean(100 * (errors / test_labels))
accuracy = 100 - mape
print('Accuracy:', round(accuracy, 2), '%.')

from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

print(random_grid)

# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation,
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(train_features, train_labels)

print("Best parrameters")
print(rf_random.best_params_)

print("After hyperparamater tuning:")
rf_best = RandomForestRegressor(n_estimators=rf_random.best_params_['n_estimators'], min_samples_split=rf_random.best_params_['min_samples_split'],
                                min_samples_leaf=rf_random.best_params_['min_samples_leaf'], max_features=rf_random.best_params_['max_features'],
                                max_depth=rf_random.best_params_['max_depth'], bootstrap=rf_random.best_params_['bootstrap'])

rf_best.fit(train_important, train_labels)

# Make predictions and determine the error
predictions = rf_best.predict(test_important)
errors = abs(predictions - test_labels)

# Display the performance metrics
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')
mape = np.mean(100 * (errors / test_labels))
accuracy = 100 - mape
print('Accuracy:', round(accuracy, 2), '%.')
# Print out the mean square error.
mse = mean_squared_error(test_labels, predictions)
print('RMSE:', np.sqrt(mse))

     year  month  day   week  temp_2  temp_1  average  actual  forecast_noaa  forecast_acc  forecast_under  friend
0    2016      1    1    Fri      45      45     45.6      45             43            50              44      29
1    2016      1    2    Sat      44      45     45.7      44             41            50              44      61
2    2016      1    3    Sun      45      44     45.8      41             43            46              47      56
3    2016      1    4    Mon      44      41     45.9      40             44            48              46      53
4    2016      1    5   Tues      41      40     46.0      44             46            46              46      41
..    ...    ...  ...    ...     ...     ...      ...     ...            ...           ...             ...     ...
343  2016     12   27   Tues      42      42     45.2      47             41            50              47      47
344  2016     12   28    Wed      42      47     45.3      48             41    

In [3]:
#Import scikit-learn dataset library
from sklearn import datasets

#Load dataset
iris = datasets.load_iris()

# Creating a DataFrame of given iris dataset.
import pandas as pd
data=pd.DataFrame({
    'sepal length':iris.data[:,0],
    'sepal width':iris.data[:,1],
    'petal length':iris.data[:,2],
    'petal width':iris.data[:,3],
    'species':iris.target
})

print(data.head())

# Import train_test_split function
from sklearn.model_selection import train_test_split
X=data[['sepal length', 'sepal width', 'petal length', 'petal width']]  # Features
y=data['species']  # Labels

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier

#Create a Gaussian Classifier
rf=RandomForestClassifier(n_estimators=100)

#Train the model using the training sets y_pred=rf.predict(X_test)
rf.fit(X_train, y_train)

y_pred=rf.predict(X_test)

#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
# Model Accuracy, how often is the classifier correct?
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

# Predict species for a single flower.
# sepal length = 3, sepal width = 5
# petal length = 4, petal width = 2
prediction = rf.predict([[3, 5, 4, 2]])
# 'setosa', 'versicolor', 'virginica'
print(prediction)

# Get numerical feature importances
importances = list(rf.feature_importances_)
# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(X, importances)]
# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
# Print out the feature and importances
for pair in feature_importances:
    print('Variable: {:20} Importance: {}'.format(*pair))

# New random forest with only the two most important variables
rf_most_important = RandomForestClassifier(n_estimators=100)

# Extract the two most important features
important_cols = [feature_importances[x][0] for x in range(0, 2)]
train_important = X_train[important_cols]
test_important = X_test[important_cols]

# Train the random forest
rf_most_important.fit(train_important, y_train)

# Make predictions and determine the error
predictions = rf_most_important.predict(test_important)
errors = abs(predictions - y_test)

import numpy as np

# Display the performance metrics
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')
mape = np.mean(100 * (errors / y_test))
accuracy = 100 - mape
print('Accuracy:', round(accuracy, 2), '%.')


   sepal length  sepal width  petal length  petal width  species
0           5.1          3.5           1.4          0.2        0
1           4.9          3.0           1.4          0.2        0
2           4.7          3.2           1.3          0.2        0
3           4.6          3.1           1.5          0.2        0
4           5.0          3.6           1.4          0.2        0
Accuracy: 0.9777777777777777
[2]
Variable: petal length         Importance: 0.46
Variable: petal width          Importance: 0.4
Variable: sepal length         Importance: 0.12
Variable: sepal width          Importance: 0.02
Mean Absolute Error: 0.02 degrees.
Accuracy: 96.77 %.




In [12]:
from sklearn import datasets
iris = datasets.load_iris()

# Creating a DataFrame of given iris dataset.
import pandas as pd
data=pd.DataFrame({
    'sepal length':iris.data[:,0],
    'sepal width':iris.data[:,1],
    'petal length':iris.data[:,2],
    'petal width':iris.data[:,3],
    'species':iris.target
})

print(data.head())

# Import train_test_split function
from sklearn.model_selection import train_test_split
X=data[['sepal length', 'sepal width', 'petal length', 'petal width']]  # Features
y=data['species']  # Labels

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

from sklearn.preprocessing import StandardScaler
sc_x            = StandardScaler()
X_train_scaled  = sc_x.fit_transform(X_train)
X_test_scaled = sc_x.transform(X_test)

from sklearn              import metrics
from sklearn.ensemble     import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

def buildModelAndPredict(clf, X_train_scaled, X_test_scaled, y_train, y_test, title):
    print("\n**** " + title)
    #Train the model using the training sets y_pred=rf.predict(X_test)
    clf_fit = clf.fit(X_train_scaled,y_train)
    y_pred = clf_fit.predict(X_test_scaled)

    print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

    # For explanation see:
    # https://towardsdatascience.com/multi-class-metrics-made-simple-part-i-precision-and-recall-9250280bddc2
    print(metrics.classification_report(y_test, y_pred, digits=3))

    # Predict species for a single flower.
    # sepal length = 3, sepal width = 5
    # petal length = 4, petal width = 2
    prediction = clf_fit.predict([[3, 5, 4, 2]])

    # 'setosa', 'versicolor', 'virginica'
    print(prediction)

lr = LogisticRegression(fit_intercept=True, solver='liblinear')
buildModelAndPredict(lr, X_train_scaled, X_test_scaled, y_train, y_test, "Logistic Regression")

rf = RandomForestClassifier(n_estimators=200, max_features=3)
buildModelAndPredict(rf, X_train_scaled, X_test_scaled, y_train, y_test, "Random Forest Regressor")

   sepal length  sepal width  petal length  petal width  species
0           5.1          3.5           1.4          0.2        0
1           4.9          3.0           1.4          0.2        0
2           4.7          3.2           1.3          0.2        0
3           4.6          3.1           1.5          0.2        0
4           5.0          3.6           1.4          0.2        0

**** Logistic Regression
Accuracy: 0.9333333333333333
              precision    recall  f1-score   support

           0      1.000     1.000     1.000        15
           1      1.000     0.800     0.889        15
           2      0.833     1.000     0.909        15

    accuracy                          0.933        45
   macro avg      0.944     0.933     0.933        45
weighted avg      0.944     0.933     0.933        45

[2]

**** Random Forest Regressor
Accuracy: 0.9333333333333333
              precision    recall  f1-score   support

           0      1.000     1.000     1.000        15
  