In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt



from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score, make_scorer, accuracy_score, classification_report



from datacleaner import autoclean
import klib
from sklearn.feature_selection import mutual_info_regression
from joblib import dump, load
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

from imblearn.over_sampling import SMOTE


from tabulate import tabulate
import missingno as msno
import warnings
warnings.filterwarnings('ignore')



from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBRegressor
from lightgbm import LGBMClassifier, LGBMRegressor

In [3]:
# Minimal demo frame: a single categorical 'status' column with two rows.
status_values = ['yes', 'no']
df = pd.DataFrame({'status': status_values})
df

Unnamed: 0,status
0,yes
1,no


Non-numeric rows of a single column: rows whose value cannot be parsed as a number

In [None]:
# Values of one column that cannot be parsed as a number.
# The same column label is used for the mask and for the selection (the original
# mixed 'column_name' and 'column name', which raises KeyError).
# Values that are already NaN are selected too, because the mask is
# "coerced value is NaN".
non_numeric_mask = pd.to_numeric(df['column_name'], errors='coerce').isna()
non_num = df.loc[non_numeric_mask, 'column_name']

All rows that contain a non-numeric value in at least one column

In [None]:
# Rows in which at least one column holds a value that is not numeric (or is missing).
# pd.to_numeric is applied column by column instead of through the element-wise
# DataFrame.applymap, which newer pandas releases deprecate.
coerced = df.apply(pd.to_numeric, errors='coerce')
non_num_rows = df[coerced.isna().any(axis=1)]

Removing strings from rows of numerical columns


In [None]:
# Keep the first run of digits/dots found in each value (e.g. '12.5 kg' -> '12.5'),
# then convert it to a numeric dtype. Values without a match, or with an
# unparsable match such as '1.2.3', become NaN.
# expand=False makes str.extract return a Series, so a Series (not a one-column
# DataFrame) is what gets assigned back to the column.
extracted = df['column name'].astype(str).str.extract(r'([0-9.]+)', expand=False)
df['column name'] = pd.to_numeric(extracted, errors='coerce')

Frequency encoding (for one column)

In [None]:
# Frequency encoding: every category is replaced by the number of rows it occurs in.
column = df['column name']
freq = column.value_counts()
df['column name'] = column.map(freq)

For all columns (non-numeric columns with more than 50 unique values)


In [None]:
# Frequency-encode the non-numeric columns, but only those with more than
# 50 distinct values; lower-cardinality columns are left untouched.
cat_cols = df.select_dtypes(exclude='number').columns
for col in cat_cols:
    cardinality = df[col].nunique()
    if cardinality <= 50:
        continue
    freq = df[col].value_counts()
    df[col] = df[col].map(freq)


Checking the skewness of columns before scaling to apply right scaling technique

In [None]:
# One histogram (with a KDE overlay) per numeric column, each on its own figure,
# to judge symmetry/skewness by eye before choosing a scaler.
numeric_columns = df.select_dtypes(include='number').columns
for col in numeric_columns:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(df[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    plt.show()

After checking symmetry and skewness of columns, separating them into two lists to apply standardscaler to one and robustscaler to another

In [None]:
# Enter the roughly symmetric (normally distributed) column names manually.
normal_dist_cols = ['column name', 'column name', 'column name', 'column name']

# Only numeric columns can be scaled or log-transformed, so the candidate pool is
# restricted to them; built from df.columns, the "skewed" list would also pick
# up text/date columns.
all_cols = df.select_dtypes(include='number').columns.to_list()

# Every remaining numeric column is treated as skewed.
skewed_cols = [col for col in all_cols if col not in normal_dist_cols]

Skewness using skew() function and giving threshold 0.5 and -0.5

In [None]:
# Split numeric columns by absolute skewness with a 0.5 threshold.
# The original assigned `normal_cols` twice, so the symmetric list was lost and
# `skewed_cols` was never produced by this cell.
skewness = df.skew(numeric_only=True)
abs_skewness = skewness.abs()
normal_cols = skewness[abs_skewness <= 0.5].index.to_list()
skewed_cols = skewness[abs_skewness > 0.5].index.to_list()

Math Feature transforming using np.log1p()

In [None]:
# Element-wise log(1 + x) on the skewed columns.
# NOTE(review): assumes every column in skewed_cols is numeric with values > -1 — confirm.
df[skewed_cols] = np.log1p(df[skewed_cols])

Feature transforming (Date into year / month / day)

In [None]:
# Parse the 'date' column, then split it into separate year / month / day columns.
df['date'] = pd.to_datetime(df['date'])
for part in ('year', 'month', 'day'):
    df[part] = getattr(df['date'].dt, part)

By default pd.to_datetime reads ambiguous dates as month / day / year.
If the day comes first in your data (e.g. 30.12.2020), pass dayfirst=True.

Mutual info ni python function orqali yaratish

In [None]:
def mtop_scores(df, target_column, top_n=9, random_state=None):
    """Return the features with the highest mutual information with the target.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both the features and the target column. It is not
        modified; a copy is used internally.
    target_column : str
        Name of the target column; it is removed from the feature set.
    top_n : int, default 9
        Number of top-scoring features to return.
    random_state : int or None, default None
        Forwarded to ``mutual_info_regression``. Pass an integer to get the
        same scores on every run.

    Returns
    -------
    pandas.DataFrame
        One column, 'Mutual Info', indexed by feature name and sorted in
        descending order, truncated to ``top_n`` rows.

    Raises
    ------
    KeyError
        If ``target_column`` is not a column of ``df``.
    """
    x = df.copy()
    y = x.pop(target_column)

    mi_scores = mutual_info_regression(x, y, random_state=random_state)
    mi_scores_df = pd.DataFrame(mi_scores, index=x.columns, columns=['Mutual Info'])
    mi_scores_df = mi_scores_df.sort_values(by='Mutual Info', ascending=False)

    return mi_scores_df.head(top_n)

# Nine most informative features with respect to the 'price' column.
top_scores = mtop_scores(df, target_column='price', top_n=9)
top_scores

Pipeline tuzish

In [None]:
# OneHotEncoder is not among the notebook's top-level imports, so it is imported
# here. (The original bound the name to the integer 1, which made the
# OneHotEncoder(...) call below raise TypeError.)
from sklearn.preprocessing import OneHotEncoder

# NOTE(review): if df still contains the target column, it is picked up here as a
# feature — confirm that the target has been separated beforehand.
num_col = df.select_dtypes(include='number').columns
cat_col = df.select_dtypes(exclude='number').columns

# Numeric branch: fill gaps with the column mean, then standardise.
numerical_features = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot encode;
# categories unseen during fit are ignored at transform time.
categorical_features = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_features, num_col),
    ('cat', categorical_features, cat_col),
])

model = RandomForestClassifier()

# Bound to lowercase `pipeline` so the sklearn `Pipeline` class is not shadowed;
# rebinding `Pipeline` would make this cell (and any later Pipeline(...) call)
# fail on a second run.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model),
])

KFoldni ishlatish

In [None]:
# 5-fold shuffled cross-validation; the scorer returns negated MSE per fold,
# so the sign is flipped and the square root taken to get a per-fold RMSE.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
neg_mse_per_fold = cross_val_score(model, x, y, cv=kf, scoring='neg_mean_squared_error')
cv_score = np.sqrt(-neg_mse_per_fold)

# Mean and spread of the per-fold RMSE.
print(cv_score.mean())
print(cv_score.std())

Klib da missing valuelarni ko'rish

In [None]:
# Draw klib's missing-value overview for the whole frame.
klib.missingval_plot(df)

Baselineni ishlatish 

In [None]:
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_squared_error

# Baseline: a regressor that always predicts the mean of the training target.
# The original created the dummy but never fitted or scored it.
# NOTE(review): assumes x_train / y_train / x_test come from an earlier
# train_test_split — confirm the names used in your notebook.
dummy = DummyRegressor(strategy='mean')
dummy.fit(x_train, y_train)
baseline_rmse = np.sqrt(mean_squared_error(y_test, dummy.predict(x_test)))

model = DecisionTreeClassifier(max_depth=4, criterion='gini')

# RMSE of the existing predictions `y_pred`. np.sqrt(MSE) replaces the
# `squared=False` argument, which current scikit-learn releases no longer accept.
# The name `mse` is kept for compatibility although the value is an RMSE.
mse = np.sqrt(mean_squared_error(y_test, y_pred))

plot_tree orqali tree model orqasini ko'rish (classification uchun)

In [None]:
from sklearn.tree import plot_tree

# Visualise the fitted decision tree.
# feature_names is passed as a plain list: some scikit-learn releases reject a
# pandas Index for this parameter.
# NOTE(review): `model` must already be fitted, and class_names must follow the
# order of model.classes_ — confirm 'Good' / 'Bad' matches your target.
plt.figure(figsize=(20,10))
plot_tree(model, feature_names=list(x.columns), class_names=['Good', 'Bad'], filled=True)
plt.show()

matplotlib plt orqali ikkita featureni aloqasini ko'rish mean bilan

In [None]:
import matplotlib.pyplot as plt

# Mean 'Age' per 'Sex' group, as a two-column frame.
gender_by_age = df.groupby('Sex')['Age'].mean().reset_index()

# Bar chart of the group means using the explicit Axes interface.
fig, ax = plt.subplots(figsize=(10,8))
ax.bar(gender_by_age['Sex'], gender_by_age['Age'], color='skyblue')
ax.set_xlabel('Gender')
ax.set_ylabel('Age')
ax.set_title("Yoshga ko'ra jins taqsimoti")
plt.show()

Seaborn orqali ikkita feature orasidagi aloqani mean bilan ko'rish

In [None]:
import seaborn as sns

# Mean 'Age' per 'Sex' group, as a two-column frame.
gender_by_age = df.groupby('Sex')['Age'].mean().reset_index()

plt.figure(figsize=(10,8))
sns.barplot(data=gender_by_age, x='Sex', y='Age', palette='Reds')
# Axis labels follow the plotted variables: x is 'Sex', y is 'Age'
# (the original had the two labels swapped).
plt.xlabel('Gender')
plt.ylabel('Age')
plt.title("Yoshga ko'ra jins taqsimoti")
plt.show()

Plotly express orqali ikkita feature orasidagi aloqani mean bilan ko'rish

In [None]:
import plotly.express as px

# Mean 'Age' per 'Sex' group, as a two-column frame.
gender_by_age = df.groupby('Sex')['Age'].mean().reset_index()

# Title and display labels shared by both charts.
chart_title = "Yoshga ko'ra jins taqsimoti"
display_labels = {'Sex': 'Jins', 'Age': 'Yosh'}

# Bar chart (optionally set pio.renderers.default='browser' to open it in a browser).
fig = px.bar(gender_by_age, x='Sex', y='Age', title=chart_title, labels=display_labels, color='Sex')
fig.show()

# Pie chart of the same group means.
fig = px.pie(gender_by_age, names='Sex', values='Age', title=chart_title, labels=display_labels, color='Sex')
fig.show()

Checking whether you have installed a certain library with a certain keyword

In [None]:
import importlib.util
import sys

# sys.modules only lists modules that were already imported in this session,
# so it cannot tell whether a library is installed.
is_imported = 'pandas' in sys.modules

# find_spec() looks the module up on the import path without importing it;
# it returns None when the library is not installed.
is_installed = importlib.util.find_spec('pandas') is not None
is_installed

Joblib orqali dump va load

In [None]:
from joblib import dump, load

# Single place for the file name used by both the save and the reload.
model_path = 'model_name.joblib'

# Persist the estimator to disk, then read it back into the same name.
# Only load joblib files you trust: loading executes pickled code.
model = DecisionTreeClassifier()
dump(model, model_path)
model = load(model_path)

GridSearchCV bilan hyperparameter tuning

In [None]:
# Search space for a tree-based classifier.
param_grid = {
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5],
    # 'auto' is left out: current scikit-learn tree estimators reject it.
    'max_features': ['sqrt', 'log2', None],
}

# Exhaustive search over param_grid with 5-fold CV, scored by accuracy.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    scoring='accuracy',
)

# best_params_ / best_estimator_ only exist after the search has been fitted.
# NOTE(review): assumes x_train / y_train come from an earlier train_test_split — confirm.
grid_search.fit(x_train, y_train)

print("Best parameters:", grid_search.best_params_)
print('Best model:', grid_search.best_estimator_)

RandomizedSearchCV bilan hyperparameter tuning

In [None]:
# Search space for the decision-tree regressor.
params = {
    'max_depth': [3, 5, 7, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 3, 5],
    # 'auto' is left out: current scikit-learn tree estimators reject it
    # (for a regressor it meant "all features", which None already covers).
    'max_features': ['log2', 'sqrt', None],
}

# Samples 10 random combinations from `params`, each evaluated with 5-fold CV;
# random_state fixes which combinations are drawn.
random_search = RandomizedSearchCV(
    estimator=DecisionTreeRegressor(),
    param_distributions=params,
    n_iter=10,
    cv=5,
    n_jobs=-1,
    random_state=42,
)

# The search does nothing until it is fitted.
# NOTE(review): assumes x_train / y_train come from an earlier train_test_split — confirm.
random_search.fit(x_train, y_train)

print("Best parameters:", random_search.best_params_)
print('Best model:', random_search.best_estimator_)

Confusion matrix ni ishlatish

In [None]:
# Labels shown on the matrix axes (placeholders: replace with the real class names).
class_names = ['target class name', 'target class name']

# Confusion matrix of true vs. predicted labels, rendered with a blue colour map.
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=class_names,
)
disp.plot(cmap=plt.cm.Blues)
plt.title('Error Analysis')
plt.show()

Saving files as csv

In [None]:
# Each frame goes to its own file, named after the variable.
# (The original wrote both to 'name.csv', so the second call overwrote the first.)
# index=False leaves the row index out of the file.
x_test.to_csv('x_test.csv', index=False)
y_test.to_csv('y_test.csv', index=False)

Renaming column names

In [None]:
# Old label -> new label; df is modified in place.
renamed_columns = {
    'Height (cm)': 'Height',
    'Weight (kg)': 'Weight',
}
df.rename(columns=renamed_columns, inplace=True)

How to lowercase the values of a column and strip surrounding whitespace (e.g. Male --> male)

In [None]:
# List every text column to normalise. The original repeated the identical
# statement twice for the same column, which had no additional effect.
text_cols = ['column name']
for col in text_cols:
    # Lowercase, then drop leading/trailing whitespace.
    df[col] = df[col].str.lower().str.strip()

To make a categorical column numerical

In [None]:
# Parse the column as numbers; anything that cannot be parsed becomes NaN.
numeric_values = pd.to_numeric(df['column_name'], errors='coerce')
df['column_name'] = numeric_values

plt va sns orqali missing value larni ko'rish

In [None]:
# Count nulls per column, keep only columns that have any, largest first.
null_counts = df.isnull().sum()
missing = null_counts[null_counts > 0].sort_values(ascending=False)

# Horizontal bar chart: one bar per feature with missing values.
fig, ax = plt.subplots(figsize=(10,6))
sns.barplot(x=missing.values, y=missing.index, palette='viridis', ax=ax)
ax.set_title('Missing values by features')
ax.set_xlabel('Number of missing values')
ax.set_ylabel('Feature')
plt.show()

plotly.express orqali piechartda missing value larni ko'rish

In [None]:
# Imported here so the cell does not depend on another plotting cell having run.
import plotly.express as px

# Column whose missing / present split is charted; also used in the title so the
# two cannot drift apart (the original title said 'Age' for 'Square Footage').
column = 'Square Footage'

# Count missing vs. present values and label the two groups.
value_counts = df[column].isnull().value_counts().rename(index={True: 'Missing', False: 'Not Missing'}).reset_index()
value_counts.columns = ['Status', 'Count']

px.pie(value_counts, names='Status', values='Count', title=f'Missing vs Not Missing - {column}')

Regression modelda actual vs predicted valuelarni visual ko'rish (confusion matrix faqat classificationda ishlab, regressionda ishlamagani sababli)

In [None]:
# Scatter of actual vs. predicted values; points on the dashed red diagonal
# are perfect predictions.
ax = sns.scatterplot(x=y_val, y=y_pred)
ax.set_xlabel("Actual Values")
ax.set_ylabel("Predicted Values")
ax.set_title("Actual vs. Predicted")

# Diagonal spanning the range of the actual values.
diag_min, diag_max = y_val.min(), y_val.max()
ax.plot([diag_min, diag_max], [diag_min, diag_max], 'r--')
plt.show()