# Testing a random forest classifier on a dataset with and without duplicates

### Generating a synthetic dataset (no duplicates)

In [1]:
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

Baseline example: the dataset contains no duplicate rows.

In [2]:
# Synthetic 10-class problem: 2000 samples, 100 features
# (30 informative + 70 redundant), one cluster per class, fixed seed.
X, y = make_classification(
    n_samples=2000,
    n_features=100,
    n_informative=30,
    n_redundant=70,
    n_classes=10,
    n_clusters_per_class=1,
    random_state=12345,
)

In [3]:
# Label the feature columns 1..n_features. The count is taken from X itself so
# the labels stay in sync if the number of generated features is changed.
df_X = pd.DataFrame(X, columns=list(range(1, X.shape[1] + 1)))

In [4]:
# Wrap the label array in a Series named 'class' so the target is a pandas
# object (as the df_ prefix suggests) with a default 0..n-1 row index.
df_y = pd.Series(y, name='class')

In [5]:
# Stratified 75% / 25% train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df_X,
    df_y,
    test_size=0.25,
    train_size=0.75,
    stratify=df_y,
    random_state=12345,
)

In [6]:
# Random forest with default hyper-parameters and a fixed seed.
model = RandomForestClassifier(random_state=12345)

In [7]:
# Train on the training split; the fitted estimator is the cell's output.
model.fit(X=X_train, y=y_train)

In [8]:
# Predict the held-out split and print per-class precision / recall / F1.
y_pred = model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.65      0.66      0.65        50
           1       0.80      0.76      0.78        49
           2       0.67      0.73      0.70        49
           3       0.54      0.57      0.55        51
           4       0.73      0.66      0.69        50
           5       0.66      0.78      0.72        50
           6       0.71      0.80      0.75        50
           7       0.86      0.63      0.73        49
           8       0.81      0.69      0.74        51
           9       0.70      0.76      0.73        51

    accuracy                           0.70       500
   macro avg       0.71      0.70      0.71       500
weighted avg       0.71      0.70      0.70       500



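### Generating a synthetic dataset with duplicates

The first 50 rows are repeated 10 times and appended to the original data before the train/test split, so copies of the same row can end up in both splits.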

In [9]:
# Regenerate the synthetic data with the same arguments and seed:
# 2000 samples, 100 features (30 informative + 70 redundant), 10 classes.
X, y = make_classification(
    n_samples=2000,
    n_features=100,
    n_informative=30,
    n_redundant=70,
    n_classes=10,
    n_clusters_per_class=1,
    random_state=12345,
)

In [10]:
# Label the feature columns 1..n_features. The count is taken from X itself so
# the labels stay in sync if the number of generated features is changed.
df = pd.DataFrame(X, columns=list(range(1, X.shape[1] + 1)))

In [11]:
# Attach the labels as a trailing 'class' column (new frame bound to `df`).
df = df.assign(**{'class': y})

In [12]:
# Rows 0..49 (all columns): these are the rows that will be duplicated.
df_first_50 = df.iloc[:50, :]

In [13]:
# Preview the first five of the rows to be duplicated (head() defaults to 5).
df_first_50.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,92,93,94,95,96,97,98,99,100,class
0,-7.399716,7.139481,-0.258633,-5.053021,2.484434,8.082464,3.369002,-3.101034,-19.057729,5.630944,...,6.649029,-0.482377,1.298302,-2.65618,0.646447,1.036012,-10.46901,-8.686394,-8.277129,3
1,-16.639485,-12.280839,15.260885,-9.317742,-0.377278,-4.797686,6.463178,-5.874903,-18.849125,11.835292,...,-1.781684,-2.253119,-0.022432,-6.90991,14.263716,7.515207,5.022184,0.457173,3.230131,1
2,15.355822,7.565411,-3.145851,-11.92682,6.477332,-13.99577,4.344364,-8.961307,36.083471,-4.712912,...,-10.437824,3.7976,4.103066,-1.581752,-25.85594,4.321968,-4.201655,1.252183,1.875502,2
3,1.746259,9.655583,9.094123,17.289277,-0.257277,5.686839,-11.267928,-2.296214,-35.580863,17.459162,...,4.224824,-2.29914,-7.205907,-5.62209,7.383644,-8.95478,-17.825772,-13.395244,-7.968702,2
4,-27.427293,-19.028408,-2.699808,-5.219483,3.698449,15.59612,-14.883694,20.082762,-11.150458,25.706732,...,1.545061,4.019271,-5.605206,-0.719066,-6.391155,-13.718956,31.175127,-23.006289,22.915178,6


In [14]:
# Original data followed by ten copies of the first 50 rows, re-indexed 0..n-1.
df_repeated = pd.concat([df] + [df_first_50] * 10, ignore_index=True)

In [15]:
# Feature matrix: every column except the label.
df_X = df_repeated.drop(columns='class')

In [16]:
# Target vector: the 'class' column of the duplicated frame.
df_y = df_repeated.loc[:, 'class']

In [17]:
# Stratified 75% / 25% train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df_X,
    df_y,
    test_size=0.25,
    train_size=0.75,
    stratify=df_y,
    random_state=12345,
)

In [18]:
# Build a fresh estimator for this experiment instead of re-fitting the object
# left over from the earlier section, so this section runs on its own.
# The seed matches the one used for the baseline model.
model = RandomForestClassifier(random_state=12345)
model.fit(X_train, y_train)

In [19]:
# Predict the held-out split and print per-class precision / recall / F1.
y_pred = model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.84      0.74      0.79        62
           1       0.81      0.82      0.81        72
           2       0.86      0.85      0.86        67
           3       0.79      0.85      0.81        65
           4       0.57      0.72      0.63        53
           5       0.82      0.73      0.77        67
           6       0.84      0.73      0.78        63
           7       0.82      0.78      0.80        54
           8       0.80      0.83      0.81        66
           9       0.73      0.77      0.75        56

    accuracy                           0.78       625
   macro avg       0.79      0.78      0.78       625
weighted avg       0.79      0.78      0.79       625

