# Data Loading, Cleaning and Preprocessing

In [None]:
# Notebook author metadata.
__author__ = "Kabir Jaiswal"

In [None]:
# `pd` is used right below and `np` by the later prediction/percentile cells,
# but neither was imported anywhere in the notebook; import them up front.
import numpy as np
import pandas as pd

# Load the cleaned ENARM CSV from the Kaggle input directory.
data = pd.read_csv("../input/enarm-20012019/ENARM_clean.csv")
# read_csv already returns a DataFrame; the wrapper is kept so that both
# `data` and `df` stay defined for the following cells.
df = pd.DataFrame(data)
# Preview the first ten rows.
df.head(10)

In [None]:
# Translate the Spanish column headers to English, modifying `df` in place.
column_translation = {
    'estado': 'region',
    'universidad': 'university',
    'concursantes': 'contestants',
    'seleccionados': 'selected',
    'promedio': 'average_marks',
    'año': 'year',
}
df.rename(columns=column_translation, inplace=True)
# Preview the first rows to confirm the new headers.
df.head()

In [None]:
# Display summary statistics for the frame's columns.
df.describe()

In [None]:
# Remove the region and year columns from `df` in place, one after the other.
for unused_column in ("region", "year"):
    del df[unused_column]

In [None]:
# Preview the first rows now that "region" and "year" have been removed.
df.head()

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
class ColumnLabelEncoder:
    """Label-encode columns of a DataFrame with scikit-learn's LabelEncoder.

    Each selected column is replaced by the integer codes produced by a fresh
    ``LabelEncoder``. No encoder state is stored on the instance: ``fit`` is a
    no-op and ``transform`` refits on whatever frame it is given, so codes are
    only consistent within a single call.
    """

    def __init__(self, columns=None):
        """Store the column names to encode; ``None`` means every column."""
        self.columns = columns

    def fit(self, X, y=None):
        """Do nothing and return ``self`` (kept for a fit/transform-style API)."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the selected columns label-encoded.

        Transforms the columns of ``X`` listed in ``self.columns`` using
        ``LabelEncoder()``. If no columns were specified, transforms all
        columns in ``X``. The input frame itself is not modified.
        """
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            # DataFrame.iteritems() was removed in pandas 2.0; items() is the
            # equivalent that exists in both old and new pandas versions.
            for colname, col in output.items():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self, X, y=None):
        """Call ``fit`` then ``transform`` on ``X`` and return the result."""
        return self.fit(X, y).transform(X)

In [None]:
# Replace the values of the "university" column with LabelEncoder integer codes.
university_encoder = ColumnLabelEncoder(columns=['university'])
df = university_encoder.fit_transform(df)

In [None]:
# Preview the first fifty rows after encoding the "university" column.
df.head(50)

In [None]:
from sklearn.model_selection import train_test_split
# Work on the raw array behind the frame: every column except the last one is
# a feature, and the last column is the regression target.
data = df.values
X = data[:, :-1]
y = data[:, -1]

In [None]:
# Show the shapes of the feature matrix and the target vector.
print(X.shape, y.shape)

In [None]:
# Hold out 33% of the rows for testing; the seed is fixed so the split is
# reproducible. train_test_split returns [X_train, X_test, y_train, y_test].
split_parts = train_test_split(X, y, test_size=0.33, random_state=1)
X_train, X_test, y_train, y_test = split_parts
# Print the four shapes on one line, in the same order as the split result.
print(*(part.shape for part in split_parts))

# Anomaly Detection with Isolation Forest

In [None]:
from sklearn.ensemble import IsolationForest
# Fit an isolation forest on the training features and keep only the rows it
# does not flag as outliers (fit_predict marks outliers with -1).
# random_state is pinned, like the train/test split, so that reruns of the
# notebook drop the same rows and report the same MAE.
iso = IsolationForest(contamination=0.1, random_state=1)
yhat = iso.fit_predict(X_train)
mask = yhat != -1
X_train, y_train = X_train[mask, :], y_train[mask]

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import IsolationForest
from sklearn.metrics import mean_absolute_error
# Train ordinary least squares on the filtered training rows, then score it on
# the (unfiltered) test split with the mean absolute error.
model = LinearRegression().fit(X_train, y_train)
yhat = model.predict(X_test)
mae = mean_absolute_error(y_test, yhat)
print('MAE:', mae, sep='')

# Anomaly Detection with Minimum Covariance Determinant

In [None]:
from sklearn.covariance import EllipticEnvelope
# Start again from the full data set so this experiment is independent of the
# isolation-forest filtering done earlier.
data = df.values
X, y = data[:, :-1], data[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)

# Fit the envelope on the training features and keep only the rows it does not
# flag as outliers (fit_predict marks outliers with -1).
# random_state is pinned, like the split, so the filtered rows, and therefore
# the model and the MAE, are the same on every run.
ee = EllipticEnvelope(contamination=0.01, random_state=1)
yhat = ee.fit_predict(X_train)
mask = yhat != -1
X_train, y_train = X_train[mask, :], y_train[mask]

# Train on the filtered rows and evaluate on the unfiltered test split.
model = LinearRegression()
model.fit(X_train, y_train)
yhat = model.predict(X_test)
mae = mean_absolute_error(y_test, yhat)
print('MAE:' + str(mae))

In [None]:
# Predict the target for every row of X, in the original row order.
#
# The previous version looped over hard-coded counts (990 training rows, then
# 495 test rows). That breaks with an IndexError, or silently skips rows, as
# soon as the split or the outlier filter yields different sizes. It also
# stored predictions in shuffled train-then-test order, although the positions
# in this list are used later as row labels into the reloaded DataFrame.
# Predicting over X directly keeps position i aligned with row i of the data
# and replaces the per-row predict calls with a single vectorised call.
pred_vals = model.predict(X).tolist()
pred_vals

In [None]:
# First and third quartiles of the predicted values, and the interquartile
# range between them.
Q25, Q75 = np.percentile(pred_vals, [25, 75])
IQR = Q75 - Q25

In [None]:
# Arithmetic mean of the predicted values.
# NOTE(review): `mean` is reassigned in the plotting cells before it is read
# anywhere in this notebook; confirm whether this value is still needed.
mean=np.mean(pred_vals)

In [None]:
# Split the positions of pred_vals into two lists using outer fences at
# 3 * IQR beyond the quartiles: positions whose prediction falls outside the
# fences go to `cheated`, all other positions go to `not_cheated`.
upper_fence = Q75 + 3.0 * IQR
lower_fence = Q25 - 3.0 * IQR
cheated, not_cheated = [], []
for position, predicted in enumerate(pred_vals):
    outside = predicted > upper_fence or predicted < lower_fence
    target_list = cheated if outside else not_cheated
    target_list.append(position)

In [None]:
# Number of flagged positions versus non-flagged positions.
len(cheated),len(not_cheated)

# Results

In [None]:
import matplotlib.pyplot as plt
from scipy.stats import norm
import statistics

# NOTE(review): `cheated` holds positions into pred_vals, so the normal curve
# below is fitted to those positions rather than to the predicted scores.
# Confirm that this is the intended quantity to plot.
x_axis = cheated
if len(x_axis) < 2:
    # statistics.mean needs at least one point and statistics.stdev at least
    # two; with a 3 * IQR fence the flagged list can easily be that short.
    print("Not enough flagged entries to fit a normal distribution.")
else:
    mean = statistics.mean(x_axis)
    sd = statistics.stdev(x_axis)

    plt.title("Normal Distribution for Cheated")
    plt.plot(x_axis, norm.pdf(x_axis, mean, sd))
    plt.show()

In [None]:
import matplotlib.pyplot as plt
from scipy.stats import norm
import statistics

# NOTE(review): `not_cheated` holds positions into pred_vals, so the normal
# curve below is fitted to those positions rather than to the predicted
# scores. Confirm that this is the intended quantity to plot.
x_axis = not_cheated
if len(x_axis) < 2:
    # statistics.mean needs at least one point and statistics.stdev at least
    # two, so skip the plot instead of raising StatisticsError.
    print("Not enough non-flagged entries to fit a normal distribution.")
else:
    mean = statistics.mean(x_axis)
    sd = statistics.stdev(x_axis)

    # The original title read "Not Normal Distribution for Cheated"; the
    # negation belongs to the group being plotted, not to the distribution.
    plt.title("Normal Distribution for Not Cheated")
    plt.plot(x_axis, norm.pdf(x_axis, mean, sd))
    plt.show()

# Names of the Faculties That Cheated, and the Years

In [None]:
# Reload the CSV: the working frame had "region" and "year" deleted and
# "university" label-encoded, so the readable values are read again.
data = pd.read_csv("../input/enarm-20012019/ENARM_clean.csv")
english_headers = {
    'estado': 'region',
    'universidad': 'university',
    'concursantes': 'contestants',
    'seleccionados': 'selected',
    'promedio': 'average_marks',
    'año': 'year',
}
df = pd.DataFrame(data)
df.rename(columns=english_headers, inplace=True)

print("The Names of Faculties which cheated\n")
# NOTE(review): assumes each entry of `cheated` is a valid row label of this
# freshly loaded frame; verify against how `cheated` was built.
universities = df["university"]
years = df["year"]
for row_label in cheated:
    print(universities[row_label], years[row_label])