## Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

## Data Cleaning

In [None]:
# csv
# NOTE(review): the calls below are alternative read_csv templates. Each one rebinds
# `data`, so only the last successful read survives, and the empty '' paths are
# placeholders that must be replaced with a real file path before running.
data = pd.read_csv('dataset/advertising.csv')
data = pd.read_csv('', na_values=[" ?"])  # treat the literal " ?" as a missing value
data = pd.read_csv('', skiprows= 5)  # skip the first 5 lines of the file
data = pd.read_csv('', delimiter='\t')  # tab-separated input
data = pd.read_csv('', nrows=100)  # read only the first 100 rows
data = pd.read_csv('',  usecols= ['Item_Identifier','Item_Type',])  # load only the listed columns

#excel
# sheet_name picks the worksheet named '1985'
sheet_1985 = pd.read_excel('dataset/sales.xlsx',sheet_name='1985')
sheet_1985 = pd.read_excel('',sheet_name='1985')

#json
# NOTE(review): this path uses 'datasets/' while the reads above use 'dataset/' — confirm.
data = pd.read_json('datasets/simple.json')

In [None]:
# Quick inspection of the loaded frame. Only the value of the last line is rendered
# by the notebook; the earlier expressions are evaluated and discarded
# (data.info() still prints its summary as a side effect).
data.head()
data.describe().round(2)
data.describe().T
data.info()
data.isnull().sum()
data.isnull().sum().sort_index().head()
(data.shape, data.columns, data.dtypes)

In [None]:
# Use Item_Identifier as the row index. Rebinding the result instead of using
# inplace=True keeps the step chainable and avoids mutating the frame in place.
data = data.set_index('Item_Identifier', drop=True)

In [None]:
# row filters built from boolean masks (only the last expression of the cell is displayed)
year = data['Year']
data.loc[year == 1987]
data.loc[(year == 2009) & (data['Size'] == 'Medium')]
data[year.isin([1987, 1988, 1999])]

# column subset selected with a list of labels
select_columns = ['Item_Identifier', 'Item_MRP', 'Outlet_Establishment_Year', 'Outlet_Size']
data[select_columns]

In [None]:
# Lookup table that collapses the spelling variants of the fat-content label onto
# two codes. Labels absent from the table become NaN after .map().
mapping = {
    **dict.fromkeys(['Low Fat', 'LF', 'low fat'], 'LF'),
    **dict.fromkeys(['Regular', 'reg'], 'R'),
}
data['Item_Fat_Content'].map(mapping)  # preview only, result is discarded
data['Item_Fat_Content'] = data['Item_Fat_Content'].map(mapping)

In [None]:
# divide every MRP value by 74 (the result is not stored)
data['Item_MRP'].map(lambda mrp: mrp / 74)

def convert(price):
    """Return ``price / 74 + 1.28``.

    NOTE(review): 74 and 1.28 look like an exchange rate and a fixed add-on
    (the result is stored in a column named 'MRP_USD') — confirm.
    """
    return price / 74 + 1.28
# apply convert() element-wise and store the result as a new column
data['MRP_USD'] = data['Item_MRP'].apply(convert)

In [None]:
# sort values (sort_values returns a sorted copy; nothing is stored here)
# NOTE(review): `data_frame` is not defined anywhere in the visible notebook — confirm
# which frame these 'grade' / 'marks' examples are meant for.
data_frame.sort_values(by='grade')
# grade ascending, then marks descending within each grade
data_frame.sort_values(by=['grade', 'marks'], ascending=[True, False])

In [None]:
# Fill missing values with the column mean (the SimpleImputer cell further down is
# the scikit-learn alternative). numeric_only=True restricts the means to numeric
# columns; without it pandas >= 2.0 raises TypeError when the frame also holds
# text columns. Columns that have no computed mean are left untouched by fillna().
data = data.fillna(data.mean(numeric_only=True))
# single-column variant
data['PRCP'] = data['PRCP'].fillna(data['PRCP'].mean())

In [None]:
# remove the listed columns, then discard every row that still contains a missing value
columns_to_drop = ['Ad Topic Line', 'City', 'Country', 'Timestamp']
data = data.drop(columns=columns_to_drop).dropna()

In [None]:
# derive new columns from existing ones: total of the three scores and their average
data['Total_Marks'] = (
    data['math score']
    .add(data['reading score'])
    .add(data['writing score'])
)
data['Percentage'] = data['Total_Marks'].div(3)

In [None]:
# imputer function to fill null values
def impute_age(cols):
    """Return the age for one row, substituting a class-based default when it is missing.

    Parameters
    ----------
    cols : sequence whose first two values are (Age, Pclass)
        For example a row of ``data[['Age', 'Pclass']]`` passed in by ``apply(axis=1)``.

    Returns
    -------
    The original age when it is not null; otherwise 37 for Pclass 1, 30 for
    Pclass 2 and 25 for any other class.
    """
    # Unpack by position instead of cols[0] / cols[1]: integer keys on a
    # label-indexed Series are treated as labels, and the positional fallback is
    # deprecated in pandas 2.x. Unpacking works for Series, lists and tuples alike.
    age, pclass, *_ = cols
    if not pd.isnull(age):
        return age
    if pclass == 1:
        return 37
    if pclass == 2:
        return 30
    return 25
# run impute_age once per row over the (Age, Pclass) pair
data['Age'] = data.loc[:, ['Age', 'Pclass']].apply(impute_age, axis='columns')

In [None]:
# cast columns to their intended dtypes
column_dtypes = {"EmployeeName": 'string', "JobTitle": 'category'}
data = data.astype(column_dtypes)
# errors='coerce' turns unparseable entries into NaN instead of raising
data['Snowfall'] = pd.to_numeric(data['Snowfall'], errors='coerce')
data['Date'] = pd.to_datetime(data['Date'])

In [None]:
# summary statistics of Amount for each Class value (only the last one is displayed)
data.loc[data['Class'] == 0, 'Amount'].describe()
data.loc[data['Class'] == 1, 'Amount'].describe()

In [None]:
# five most frequent Job values
data['Job'].value_counts().head(n=5)

In [None]:
# binary-encode SalStat; the keys keep their leading space so they match the raw values
salstat_codes = {
    ' less than or equal to 50,000': 0,
    ' greater than 50,000': 1,
}
data['SalStat'] = data['SalStat'].map(salstat_codes)

In [None]:
# most common e-mail domains: the part after '@'
data['Email'].map(lambda address: address.split('@')[1]).value_counts().head()
# count of cards per second '/'-separated field of the expiry date, ordered by that field
data['CC Exp Date'].map(lambda expiry: expiry.split('/')[1]).value_counts().sort_index()

## Data Visualization

In [None]:
# global plotting defaults
import warnings
sns.set_theme(style="darkgrid")
# NOTE(review): this silences every warning, deprecation notices included — consider narrowing it
warnings.filterwarnings("ignore")
plt.rcParams.update({'figure.figsize': (10, 10)})

In [None]:
# Correlation heatmap. The frame is narrowed to numeric/bool columns first because
# DataFrame.corr() in pandas >= 2.0 raises on text columns instead of dropping them.
fig = plt.figure(figsize=(12, 10))  # size of graph
numeric_data = data.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_data.corr(), cmap='RdBu', annot=True, fmt=".2f");

In [None]:
# line plot of Item_MRP against Item_Weight
sns.lineplot(data=data, x="Item_Weight", y="Item_MRP");

In [None]:
# count of 'Clicked on Ad' values, split by the 'Male' column
ax = sns.countplot(data=data, x='Clicked on Ad', hue='Male', palette="Set1")
ax.set(title="Gender based Clicking on Ad");

In [None]:
# bar plot of Item_MRP per Item_Type
sns.barplot(data=data, x="Item_Type", y="Item_MRP")

In [None]:
# one BasePay panel per Year
sns.catplot(data=data, y='BasePay', col='Year');

In [None]:
# categorical plot of sales per outlet size
# other options for kind: 'box', 'violin', 'boxen', 'point', 'bar'
sns.catplot(data=data, x="Outlet_Size", y="Item_Outlet_Sales", kind='swarm');

In [None]:
# joint distribution of Age and time on site, coloured by 'Clicked on Ad'
sns.jointplot(data=data, x='Age', y='Daily Time Spent on Site', hue="Clicked on Ad");

In [None]:
# Area Income against time on site, coloured by 'Clicked on Ad'
sns.scatterplot(
    data=data,
    x='Area Income',
    y='Daily Time Spent on Site',
    hue='Clicked on Ad',
    palette='rocket',
);

In [None]:
# age distribution for each SalStat value
ax = sns.boxplot(data=data, x='SalStat', y='age')
ax.set(title="Receiving salary based on Age");

In [None]:
# histogram of 'fico' in 20 bins with a KDE curve overlaid
sns.histplot(data=data, x='fico', bins=20, kde=True);

In [None]:
# scatter with a fitted regression line
grid = sns.lmplot(data=data, x='Length of Membership', y='Yearly Amount Spent')
grid.set(title="Effect of Length of Membership");

In [None]:
# histogram of purchase prices in 15 bins, without a KDE curve
purchase_price = data["Purchase Price"]
sns.displot(purchase_price, bins=15, kde=False);

In [None]:
# pairwise plots of the frame's columns, coloured by 'Clicked on Ad'
sns.pairplot(data=data, hue='Clicked on Ad')

In [None]:
# subplots: one violin plot per measurement on a 2x2 grid, grouped by Species
plt.figure(figsize=(15, 10))
measurements = ['PetalLengthCm', 'PetalWidthCm', 'SepalLengthCm', 'SepalWidthCm']
for position, column in enumerate(measurements, start=1):
    plt.subplot(2, 2, position)
    sns.violinplot(x='Species', y=column, data=data)
plt.show()

In [None]:
# subplots: three bar plots side by side, one per score, grouped by gender
plt.figure(figsize=(15, 6))
plt.subplots_adjust(top=1, bottom=0, right=1, left=0, hspace=0.25, wspace=0.5)
panels = [
    (131, "Math score v/s Gender", 'math score'),
    (132, "Reading Score V/S Gender", 'reading score'),
    (133, "Writing Score V/S Gender", 'writing score'),
]
for position, panel_title, score_column in panels:
    plt.subplot(position)
    plt.title(panel_title)
    sns.barplot(x='gender', y=score_column, data=data)
plt.show()

## Preprocessing Model

In [None]:
# one-hot encode the categorical columns; drop_first drops the first level of each
data = pd.get_dummies(data=data, drop_first=True)

In [None]:
# SimpleImputer: replace NaN in columns 1 and 2 of X with the column mean
# (the positional slicing assumes X is a NumPy array — TODO confirm)
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
X[:, 1:3] = imputer.fit(X[:, 1:3]).transform(X[:, 1:3])

In [None]:
# encoding independent variables: one-hot encode column 0 of X and pass the
# remaining columns through unchanged
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
one_hot_first_column = ('encoder', OneHotEncoder(), [0])
ct = ColumnTransformer(transformers=[one_hot_first_column], remainder='passthrough')
X = np.array(ct.fit_transform(X))

# encoding dependent variables: replace the target labels with integer codes
le = LabelEncoder()
y = le.fit_transform(y)

In [None]:
# feature scaling
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# Option 1: standardise the listed columns with StandardScaler
scaler = StandardScaler()
numeric_features = ['Daily Time Spent on Site', 'Age', 'Area Income','Daily Internet Usage']
data[numeric_features] = scaler.fit_transform(data[numeric_features])
# x_train = ss.fit_transform(x_train)
# NOTE(review): the commented line above refers to `ss` and `x_train`, neither of
# which is defined in the visible notebook.

# Option 2: MinMaxScaler.
# NOTE(review): run as written, this rescales the columns that were just standardised
# and rebinds `scaler`, so presumably only one of the two options should be kept.
scaler = MinMaxScaler()
data[numeric_features] = scaler.fit_transform(data[numeric_features])

In [None]:
# removing the visible outliers in scatterplot
# Each mask is evaluated against df1 as it stands after the previous drop.
cold_min_warm_max = (df1['MinTemp'] < -15) & (df1['MaxTemp'] > 15)
df1.drop(index=df1.loc[cold_min_warm_max].index, inplace=True)
warm_min_cold_max = (df1['MinTemp'] > 8) & (df1['MaxTemp'] < -15)
df1.drop(index=df1.loc[warm_min_cold_max].index, inplace=True)

In [None]:
# PCA: dimensionality reduction that projects df onto its first two principal components
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
x_pca = pca.fit(df).transform(df)

In [None]:
# LDA: supervised dimensionality reduction that maximises the separation between classes
# NOTE(review): X_train / y_train are only created by the train_test_split cell
# further down, so this cell cannot run before it.
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
lda = LDA(n_components=2)
X_train = lda.fit_transform(X=X_train, y=y_train)

In [None]:
from sklearn.model_selection import train_test_split
# NOTE(review): `x` is not defined in the visible notebook — this drop example
# presumably belongs to a different dataset; confirm.
drop_list = ['perimeter_mean', 'radius_mean', 'compactness_mean', 'area_worst']
x = x.drop(columns=drop_list)

# separate the features from the target, then hold out 33% of the rows for testing
target_column = 'Clicked on Ad'
y = data[target_column]
X = data.drop(columns=[target_column])
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.33)

## Developing The Models

## Regression

#### Linear Regression

In [None]:
# works on any size of dataset, gives information about the relevance of features
from sklearn.linear_model import LinearRegression
lm = LinearRegression()
# Use the X_train / X_test names produced by train_test_split; the lowercase
# x_train / x_test are not defined anywhere in the visible notebook.
lm.fit(X_train, y_train)
predictions = lm.predict(X_test)

#### Polynomial Regression

In [None]:
# works very well on non-linear problems
# the polynomial degree must be chosen for a good bias/variance trade-off
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
# expand X into polynomial terms up to degree 4, then fit an ordinary linear model on them
poly_reg = PolynomialFeatures(degree=4)
X_poly = poly_reg.fit_transform(X)
lin_reg_ = LinearRegression().fit(X_poly, y)

#### Support Vector Regression

In [None]:
# easily adaptable, works very well on non-linear problems, not biased by outliers
# feature scaling is compulsory before fitting
from sklearn.svm import SVR
regressor = SVR(kernel='rbf').fit(X, y)

#### Decision Tree Regression

In [None]:
# works on both linear and non-linear problems
# overfitting can easily occur
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(random_state=0).fit(X, y)

#### Random Forest Regression

In [None]:
# powerful and accurate
# overfitting can easily occur, need to choose the number of trees
from sklearn.ensemble import RandomForestRegressor
# The model is bound to model_rf (the name the evaluation cell scores) and fitted on
# the y_train produced by train_test_split; previously model_rf was never assigned
# and Y_train was undefined.
model_rf = RandomForestRegressor(n_estimators=10, random_state=0)
model_rf.fit(X_train, y_train)
regressor = model_rf  # keep the `regressor` name pointing at the fitted model
predictions = model_rf.predict(X_test)
# A single-sample prediction needs a 2-D input with one row, e.g. [[6.5]] for a
# one-feature model; the row must have as many values as the model has features.

## Classification

#### Logistic Regression

In [None]:
# probabilistic approach, gives information about the statistical significance of features
from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression(random_state=0).fit(X_train, y_train)
predictions = logmodel.predict(X_test)

#### K Nearest Neighbours

In [None]:
# the number of neighbours k has to be chosen
from sklearn.neighbors import KNeighborsClassifier
# Minkowski metric with p=2 is the Euclidean distance
knn = KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski').fit(X_train, y_train)
pred = knn.predict(X_test)

#### Support Vector Classifier

In [None]:
# high performance on non-linear problems, not biased by outliers, not sensitive to overfitting
# not a good choice for a large number of features
from sklearn.svm import SVC
svc_model = SVC(kernel='rbf', random_state=0)  # alternative: kernel='linear'
# Use the X_train / X_test names produced by train_test_split; the lowercase
# x_train / x_test are not defined anywhere in the visible notebook.
svc_model.fit(X_train, y_train)
predictions = svc_model.predict(X_test)

#### Naive Bayes

In [None]:
# efficient, not biased by outliers, works on non-linear problems, probabilistic approach
# based on the assumption that features have the same statistical relevance
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB().fit(X_train, y_train)
y_pred = classifier.predict(X_test)

#### Decision Tree Classifier

In [None]:
# interpretable, no need for feature scaling, works on both linear and non-linear problems
# overfitting can easily occur
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier(criterion='entropy', random_state=0)
# Use the X_train / X_test names produced by train_test_split; the lowercase
# x_train / x_test are not defined anywhere in the visible notebook.
dtree.fit(X_train, y_train)
predictions = dtree.predict(X_test)

#### Random Forest Classifier

In [None]:
# powerful and accurate, good performance on many problems
# overfitting can easily occur, need to choose the number of trees
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
# Use the X_train / X_test names produced by train_test_split; the lowercase
# x_train / x_test are not defined anywhere in the visible notebook.
rf = rf.fit(X_train, y_train)
predictions = rf.predict(X_test)

## XGBoost

In [None]:
from xgboost import XGBClassifier
# gradient-boosted tree classifier with the library's default hyper-parameters
classifier = XGBClassifier()
classifier.fit(X=X_train, y=y_train)

## Clustering

#### K Means Clustering

In [None]:
# choose number of clusters using the elbow method
from sklearn.cluster import KMeans
# elbow method: within-cluster sum of squares (inertia_) for k = 1..10
cluster_counts = range(1, 11)
wcss = []
for i in cluster_counts:
    kmeans = KMeans(n_clusters = i, init = 'k-means++', random_state = 42)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)
plt.plot(cluster_counts, wcss)
plt.title(''), plt.xlabel(''), plt.ylabel(''), plt.show()

# Fit the final model with the chosen number of clusters. The per-row labels are
# kept in y_kmeans because the scatter plots below index X with them; the model is
# fitted on the same X that the elbow curve and the plots use.
kmeans = KMeans(n_clusters = 5, init = 'k-means++', random_state = 42)
y_kmeans = kmeans.fit_predict(X)
kmeans.cluster_centers_
kmeans.labels_

# visualize clusters using the first two columns of X
# (the positional indexing assumes X is a NumPy array — TODO confirm)
cluster_colors = ['red', 'blue', 'green', 'cyan', 'magenta']
for cluster_id, color in enumerate(cluster_colors):
    in_cluster = y_kmeans == cluster_id
    plt.scatter(X[in_cluster, 0], X[in_cluster, 1], s = 100, c = color, label = 'Cluster {}'.format(cluster_id + 1))
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s = 300, c = 'yellow', label = 'Centroids')
plt.title(''), plt.xlabel(''), plt.ylabel(''), plt.legend(), plt.show()

#### Hierarchical Clustering

In [None]:
# finding the optimal number of clusters using a dendrogram
import scipy.cluster.hierarchy as sch
dendrogram = sch.dendrogram(sch.linkage(X, method = 'ward'))
plt.title(''), plt.xlabel(''), plt.ylabel(''), plt.show()

# training the model
from sklearn.cluster import AgglomerativeClustering
# The `affinity` argument was deprecated in scikit-learn 1.2 and removed in 1.4.
# Euclidean distance is the default (and the only option for ward linkage), so the
# argument is simply omitted, which works on old and new versions alike.
hc = AgglomerativeClustering(n_clusters=5, linkage='ward')
y_hc = hc.fit_predict(X)

# visualize clusters using the first two columns of X
# (the positional indexing assumes X is a NumPy array — TODO confirm)
cluster_colors = ['red', 'blue', 'green', 'cyan', 'magenta']
for cluster_id, color in enumerate(cluster_colors):
    in_cluster = y_hc == cluster_id
    plt.scatter(X[in_cluster, 0], X[in_cluster, 1], s = 100, c = color, label = 'Cluster {}'.format(cluster_id + 1))
plt.title(''), plt.xlabel(''), plt.ylabel(''), plt.legend(), plt.show()

## Testing the Model

In [None]:
# classification models
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(classification_report(y_test, predictions))
# scikit-learn metrics take (y_true, y_pred). With the arguments swapped the
# confusion matrix comes out transposed (rows would be predictions instead of truth).
confusion_matrix(y_test, predictions)
accuracy_score(y_test, predictions)
# never judge a model on accuracy alone; look at the balance of accuracy, F1 and precision

In [None]:
# regression models
from sklearn.metrics import mean_squared_error
# Use the y_test / predictions names the model cells produce; Y_test and y_pred_rf
# are not defined anywhere in the visible notebook.
print(mean_squared_error(y_test, predictions))
# model_rf is expected to be the fitted random-forest regressor; score() returns R^2
print(model_rf.score(X_test, y_test))
rmse = np.sqrt(mean_squared_error(y_test, predictions))
# str.format must be called on the template; the previous `"...", format(rmse)`
# printed the literal "{}" followed by the number
print("RMSE is {}".format(rmse))

## Plotting the Results on Chart

#### Regression Chart

In [None]:
# training points in red, the model's predictions for them as a blue line
# (plotting X_train on one axis assumes a single feature — TODO confirm)
fitted_values = regressor.predict(X_train)
plt.scatter(x=X_train, y=y_train, color='red')
plt.plot(X_train, fitted_values, color='blue')
plt.title(''), plt.xlabel(''), plt.ylabel(''), plt.show()

#### Classification Chart

In [None]:
# Decision-region chart for a classifier trained on two scaled features.
# NOTE(review): `sc` is not defined in the visible notebook (the scaling cell binds
# `scaler`) — confirm which fitted scaler is meant.
from matplotlib.colors import ListedColormap
# map the training features back to their original units for plotting
X_set, y_set = sc.inverse_transform(X_train), y_train
# dense grid over both features: padded by 10 units on feature 0 and 1000 on feature 1,
# sampled every 0.25 (NOTE(review): a 0.25 step over a wide range creates a very large grid)
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 10, stop = X_set[:, 0].max() + 10, step = 0.25),
                     np.arange(start = X_set[:, 1].min() - 1000, stop = X_set[:, 1].max() + 1000, step = 0.25))
# classify every grid point (re-scaled first) and colour the plane by predicted class
plt.contourf(X1, X2, classifier.predict(sc.transform(np.array([X1.ravel(), X2.ravel()]).T)).reshape(X1.shape),
             alpha = 0.75, cmap = ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
# overlay the actual training points, one colour per class label
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1], c = ListedColormap(('red', 'green'))(i), label = j)
plt.title(''), plt.xlabel(''), plt.ylabel(''), plt.legend(), plt.show()

## Predicting the Test Dataset

In [None]:
# store the model's predictions for the test frame in a new column
# NOTE(review): `mod` is not defined in the visible notebook — confirm which fitted model is meant.
test['price_range'] = mod.predict(test)
test.head()
# NOTE(review): this assigns the object bound to `model` itself to the column, not
# its predictions, and `model` is not defined in the visible notebook — presumably
# model.predict(...) was intended; confirm.
test['Survived']=model

In [None]:
# keep only the id and the predicted column, then write them to CSV without the index
output_columns = ['id', 'price_range']
final = test.loc[:, output_columns].copy()
final.to_csv(path_or_buf='mobile_prices.csv', index=False)