**Common imports**

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

**Simple Linear Regression**

In [None]:
from sklearn.linear_model import LinearRegression


In [None]:
# Load the student exam-performance dataset and preview its first rows.
performance_csv = "../input/students-performance-in-exams/StudentsPerformance.csv"
performance_data = pd.read_csv(performance_csv)
performance_data.head()

In [None]:
# Selecting with a list of column names (double brackets) returns a DataFrame
# rather than a Series, which is the structure passed on to sklearn below.
feature_columns = ["reading score"]
target_columns = ["math score"]
X = performance_data[feature_columns]
y = performance_data[target_columns]

Predicting maths score from reading score 

In [None]:
# Fit a one-feature linear model, then overlay its predictions on the raw data.
lr = LinearRegression()
lr.fit(X, y)
math_predictions = lr.predict(X)

fig, ax = plt.subplots()
ax.plot(X, y, "o", alpha=0.4)  # alpha determines how transparent the points are
ax.plot(X, math_predictions)   # fitted regression line
plt.show()

In [None]:
# Report the fitted parameters and the R squared on the data the model was fit on.
for label, value in (
    ("slope: ", lr.coef_),
    ("intercept: ", lr.intercept_),
    ("R squared: ", lr.score(X, y)),
):
    print(label, value)

**Multiple Regression**

In [None]:
# Load the StreetEasy rental listings and preview the first rows.
streeteasy_csv = "../input/streeteasy/streeteasy.csv"
streeteasy = pd.read_csv(streeteasy_csv)
streeteasy.head()

In [None]:
# Columns used as predictors of rent.
rent_feature_columns = [
    'bedrooms',
    'bathrooms',
    'size_sqft',
    'min_to_subway',
    'floor',
    'building_age_yrs',
    'no_fee',
    'has_roofdeck',
    'has_washer_dryer',
    'has_doorman',
    'has_elevator',
    'has_dishwasher',
    'has_patio',
    'has_gym',
]
x = streeteasy[rent_feature_columns]
# NOTE: this rebinds `y`, which held the math scores in the simple-regression section.
y = streeteasy[['rent']]

# 80/20 train/test split with a fixed seed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=6,
)

In [None]:
# Fit the multiple regression on the training split and evaluate on the test split.
mlr = LinearRegression()
model = mlr.fit(x_train, y_train)

test_r_squared = mlr.score(x_test, y_test)
y_predict = mlr.predict(x_test)
print("R squared: ", test_r_squared)

In [None]:
# Scatter actual rent against predicted rent for the test split.
plt.scatter(y_test, y_predict, alpha=0.4)
plt.plot(range(20000), range(20000))  # Reference line y = x

plt.xlabel("Prices: $Y_i$")
# Raw string: "\h" is not a valid Python escape sequence, so the non-raw
# literal triggers an invalid-escape warning on modern Python versions.
# The resulting label text is unchanged.
plt.ylabel(r"Predicted prices: $\hat{Y}_i$")
plt.title("Actual Rent vs Predicted Rent")

plt.show()

In [None]:
# Predict rent for one apartment. The values are given in the same order as the
# columns the model was trained on; wrapping them in a DataFrame with those
# column names keeps the input consistent with the DataFrame used for fitting
# (avoiding sklearn's feature-name mismatch warning) and fails loudly if the
# number of values does not match the number of training columns.
zoe_apartment = pd.DataFrame(
    [[1, 1, 620, 16, 1, 98, 0, 0, 1, 0, 0, 0, 1, 0]],
    columns=x_train.columns,
)
predict = mlr.predict(zoe_apartment)
print("Predicted rent: ", predict)

**K-Nearest Neighbors Classification**

In [None]:
from sklearn.datasets import load_breast_cancer
from sklearn.neighbors import KNeighborsClassifier 

In [None]:
# Load the breast cancer dataset and inspect the first sample and the feature names.
breast_cancer_data = load_breast_cancer()
first_sample = breast_cancer_data.data[0]
print(first_sample)
print(breast_cancer_data.feature_names)

In [None]:
# Show the label vector followed by the class names, one per line.
print(breast_cancer_data.target, breast_cancer_data.target_names, sep="\n")

In [None]:
# Hold out 20% of the samples for validation; fixed seed for a reproducible split.
training_data, validation_data, training_labels, validation_labels = train_test_split(
    breast_cancer_data.data,
    breast_cancer_data.target,
    test_size=0.2,
    random_state=100,
)

In [None]:
# Sweep k from 1 to 100 and record validation accuracy for each classifier.
k_list = range(1, 101)
accuracies = []
for k in k_list:
    classifier = KNeighborsClassifier(n_neighbors=k)
    classifier.fit(training_data, training_labels)
    accuracies.append(classifier.score(validation_data, validation_labels))

plt.plot(k_list, accuracies)
plt.xlabel("k")
plt.ylabel("Validation Accuracy")
plt.title("Breast Cancer Classifier Accuracy")
# Call show(); the original referenced `plt.show` without parentheses, which
# only evaluates the function object and never invokes it.
plt.show()

print("Max accuracy: ", max(accuracies))



**Logistic Regression (with comparison to K-Nearest Neighbors)**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing 

Working with the same data as the K-Nearest Neighbors Classifier above. Sklearn's Logistic Regression implementation applies regularization by default, so the feature data should be normalized first:

In [None]:
# Standardize every feature, then print per-feature mean and standard deviation
# of the scaled data as a sanity check.
# NOTE(review): the scaler is fitted on the full dataset before the train/validation
# split in the next cell, so validation statistics influence the scaling — consider
# fitting on the training split only.
raw_features = breast_cancer_data.data
scaler = preprocessing.StandardScaler().fit(raw_features)
breast_cancer_data_scaled = scaler.transform(raw_features)
for column_statistic in (
    breast_cancer_data_scaled.mean(axis=0),
    breast_cancer_data_scaled.std(axis=0),
):
    print(column_statistic)

In [None]:
# Same split parameters as the K-Nearest Neighbors section, applied to the scaled
# features. This rebinds the training/validation variables defined there.
training_data, validation_data, training_labels, validation_labels = train_test_split(
    breast_cancer_data_scaled,
    breast_cancer_data.target,
    test_size=0.2,
    random_state=100,
)

In [None]:
# Fit a logistic regression with default settings on the scaled training split
# and show its raw coefficient array.
logistic_model = LogisticRegression().fit(training_data, training_labels)
print(logistic_model.coef_)

In [None]:
# Reuse the model fitted in the previous cell rather than constructing and
# fitting an identical LogisticRegression a second time on the same data.
# `coef_` is printed as a nested array above; take its first row as a flat list.
coefficients = logistic_model.coef_.tolist()[0]
print(coefficients)

We need to make this list easier to interpret. We can begin by matching each coefficient with its feature name. 

In [None]:
# Pair each coefficient with the feature name at the same position.
matched_coefficients = {
    breast_cancer_data.feature_names[position]: weight
    for position, weight in enumerate(coefficients)
}
print(matched_coefficients)

We can define the most important features as those that have a coefficient with an absolute value greater than 0.9 (this is a somewhat arbitrary choice based on eyeballing the output above). 

In [None]:
# Keep only the features whose coefficient magnitude exceeds 0.9.
most_important_coefficients = {
    name: weight
    for name, weight in matched_coefficients.items()
    if abs(weight) > 0.9
}
print(most_important_coefficients)

In [None]:
# Bar chart of the selected coefficients, one bar per feature.
fig, ax = plt.subplots()
ax.bar(most_important_coefficients.keys(), most_important_coefficients.values())
ax.set_title('Most important features')
ax.set_xlabel('Feature')
ax.set_ylabel('Coefficient')
plt.show()

In this data, a malignant classification takes a value of 0, so for a feature with a negative coefficient, a larger feature value increases the odds of the sample being classified as malignant.

In [None]:
# Accuracy of the logistic regression on the held-out validation split.
validation_accuracy = logistic_model.score(validation_data, validation_labels)
print("Score:", validation_accuracy)

This does slightly better than the K-Nearest Neighbors Classifier above (which got a max accuracy of 0.965). When the lives of cancer patients are at stake, even a small improvement like this matters. Furthermore, the added interpretability of logistic regression outputs means it is probably better in this case, since doctors will want to know why a particular patient has received a malignant classification (that is, which features the model considers important).