# AI - 1st month


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

# Data Exploration

In [None]:
# Load the airline passenger satisfaction data into `df`.
df = pd.read_csv('airline_passenger_satisfaction.csv')
df.info()  # prints the dtype and non-null count of every column
df.head()  # last expression of the cell: the first 5 rows are rendered as its output

In [None]:
# Load the IMDB movies data into `df1` (kept separate from the airline frame `df`).
df1 = pd.read_csv('IMDB_Movies_Dataset.csv')
df1.info()  # prints the dtype and non-null count of every column
df1.head()  # last expression of the cell: the first 5 rows are rendered as its output

# Data Preprocessing - missing values

In [None]:
# Missing-value checks for both frames.
# NOTE: a cell only renders its last expression, so the result of the first line
# is computed but never shown.
df.isnull().any()  # per column of df: True if it contains at least one missing value
df1.isnull().sum() # per column of df1: number of missing values


In [None]:
# drop the columns that are not needed
# The result is assigned back instead of using inplace=True, and errors='ignore'
# skips labels that are already gone, so re-running this cell does not raise a
# KeyError for columns a previous run removed.
df = df.drop(columns=['Unnamed: 0', 'id'], errors='ignore')
df1 = df1.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Remove every row of df that has a missing value in any column.
df = df.dropna()
# Same call restricted to one column: remove the rows whose 'Gender' is missing.
df = df.dropna(subset=['Gender'])

In [None]:
# fill missing values in 'Age' column with 25
# The filled column is assigned back to df. Calling fillna(inplace=True) on the
# df['Age'] selection is chained assignment: pandas deprecates it, and with
# Copy-on-Write it only changes a temporary object and leaves df untouched.
df['Age'] = df['Age'].fillna(25)


# fill missing values in 'Gender' column with 'Male'
df['Gender'] = df['Gender'].fillna('Male')

In [None]:
# filling missing values with the mean of the column
# (assigned back to df: an inplace fillna on a selected column is chained
# assignment and does not update df under Copy-on-Write)
df['Flight Distance'] = df['Flight Distance'].fillna(df['Flight Distance'].mean())


# fill the remaining gaps in a loop: text columns get their most frequent value
# (mode), all other columns get their mean

for col in df.columns:
    if df[col].isnull().any():
        if df[col].dtype == 'object':
            # mode() returns a Series of the most frequent values; take the first one
            df[col] = df[col].fillna(df[col].mode()[0])
        else:
            df[col] = df[col].fillna(df[col].mean())

In [None]:
# Impute df1: text columns get their most frequent value (mode), all other
# columns get their mean. Each filled column is assigned back to df1, because an
# inplace fillna on a selected column is chained assignment and does not update
# the frame under Copy-on-Write.
for col in df1.columns:
    if df1[col].isnull().any():
        if df1[col].dtype == 'object':
            # mode() returns a Series of the most frequent values; take the first one
            df1[col] = df1[col].fillna(df1[col].mode()[0])
        else:
            df1[col] = df1[col].fillna(df1[col].mean())


# feature transformation - removing special characters

In [None]:
# Two equivalent ways to turn the text placeholder 'N/A' into a real missing value.
# The in-place form is called on the frame itself with a {column: {old: new}}
# mapping; an inplace replace on the df1['Metascore'] selection would be chained
# assignment, which does not update df1 under Copy-on-Write.
df1.replace({'Metascore': {'N/A': np.nan}}, inplace=True) # replace 'N/A' with NaN using the inplace parameter
df1['Metascore'] = df1['Metascore'].replace('N/A', np.nan) # replace 'N/A' with NaN when not using the inplace parameter
# NOTE(review): the imputation loop over df1 runs in an earlier cell, so NaNs
# created here are not filled afterwards.

In [None]:
# Worldwide Gross: remove '$' and ',' first, then any "(...)" note with its leading spaces.
# The '$' pattern is a raw string: '\$' in a normal string literal is an invalid
# escape sequence and triggers a SyntaxWarning on recent Python versions.
df1['Worldwide Gross'] = df1['Worldwide Gross'].replace({r'\$': '', ',': ''}, regex=True)
df1['Worldwide Gross'] = df1['Worldwide Gross'].replace(r'\s*\([^)]*\)', '', regex=True)

# Budget: keep only digits and dots.
df1['Budget'] = df1['Budget'].replace(r'[^\d.]', '', regex=True)

# The cleaned values are still text. Convert them to numbers so they can be
# plotted and scaled as continuous variables; anything that is not a valid
# number (for example an empty string) becomes NaN.
df1['Worldwide Gross'] = pd.to_numeric(df1['Worldwide Gross'], errors='coerce')
df1['Budget'] = pd.to_numeric(df1['Budget'], errors='coerce')

# Languages: keep the first entry of the comma-separated list, without surrounding spaces.
df1['Languages'] = df1['Languages'].str.split(',').str[0].str.strip()

In [None]:
# removing all non-numeric characters from the Budget column
# (every character that is neither a digit nor '.' is replaced by an empty string;
# the same substitution is already applied to Budget in the previous cell)

df1['Budget'] = df1['Budget'].replace(r'[^\d.]', '', regex=True)
df1['Budget']  # last expression of the cell: displays the cleaned column

In [None]:
# Three ways to inspect the distinct values of a column.
# NOTE: a cell only renders its last expression, so only the nunique() result is shown.
df['Gender'].unique() # distinct values of the Gender column, returned as an array
df['Gender'].value_counts() # number of rows per distinct value, returned as a Series
df['Gender'].nunique() # number of distinct values, returned as an integer

# mapping the column

In [None]:
# Encode the target: 'neutral or dissatisfied' -> 0, 'satisfied' -> 1.
# Series.map turns every value that is not a key of the mapping into NaN, so
# running the mapping a second time would wipe the already encoded 0/1 column.
# The dtype guard makes the cell safe to re-run.
if not pd.api.types.is_numeric_dtype(df['satisfaction']):
    df['satisfaction'] = df['satisfaction'].map({
        'neutral or dissatisfied': 0,
        'satisfied': 1})

# Klib vs autoclean

In [None]:
from klib import data_cleaning
from datacleaner import autoclean

#df = data_cleaning(df) # clean the data using klib
#df1 = autoclean(df1) # clean the data using datacleaner

# feature engineering - creating new features   

In [None]:
# create a new column

# Option 1: pd.cut assigns each age to a right-closed interval
# (0, 18], (18, 30], ... and labels it; the result is a categorical column.
age_edges = [0, 18, 30, 40, 50, 60, 70, 80, 90]
age_labels = ['0-18', '18-30', '30-40', '40-50', '50-60', '60-70', '70-80', '80-90']
df['Age Group'] = pd.cut(df['Age'], bins=age_edges, labels=age_labels)


# Option 2: a plain Python function applied to every age. It overwrites the
# column created above with ordinary strings.
def _age_group(age):
    """Return the label of the first group whose upper bound is >= age.

    Anything that is not <= 80 (including values above 90) falls through to '80-90'.
    """
    for upper_bound, label in zip(age_edges[1:-1], age_labels[:-1]):
        if age <= upper_bound:
            return label
    return age_labels[-1]


df['Age Group'] = df['Age'].apply(_age_group)
 


In [None]:
df.head()  # show the first 5 rows, including the new 'Age Group' column

# Time Based Transformation

In [None]:
# Fresh copy of the IMDB data, so this section does not depend on the cleaning done to df1.
df_1 = pd.read_csv('IMDB_Movies_Dataset.csv')

# The pattern captures the text in front of a "(...)" group as the date and the
# text inside the parentheses as the country; rows that do not match get NaN in both.
release_parts = df_1['Release Date'].str.extract(r'(.+?)\s*\((.+)\)')
df_1['Date'] = pd.to_datetime(release_parts[0], errors='coerce')  # unparseable dates become NaT
df_1['Country'] = release_parts[1]

# Split the parsed date into calendar components.
release_date = df_1['Date'].dt
df_1['Year'] = release_date.year
df_1['Month'] = release_date.month_name()
df_1['Day'] = release_date.day

df_1['Year']

# Data Visualization

    Proportion of each category in the categorical columns - pie chart

In [None]:
cat_cols = df.select_dtypes(include=['object']).columns.tolist() # select categorical columns
num_cols = df.select_dtypes(include=['number']).columns.tolist() # select numerical columns

# One pie chart per categorical column, three charts per row. The number of rows
# is derived from the number of columns, so the grid never runs out of slots
# (a fixed 2x3 grid fails as soon as there is a seventh categorical column).
n_grid_cols = 3
n_grid_rows = max(1, -(-len(cat_cols) // n_grid_cols))  # ceiling division
plt.figure(figsize=(12, 3 * n_grid_rows))
for i, col in enumerate(cat_cols):
    plt.subplot(n_grid_rows, n_grid_cols, 1 + i)
    df[col].value_counts().plot(kind='pie', autopct='%0.1f')
    plt.title(col, color='red')
    # hide the axis labels that pandas adds to pie charts
    plt.xlabel('')
    plt.ylabel('')
plt.tight_layout()
plt.show()

# for col in df.columns:
#     if df[col].dtype == 'object':
#         plt.title(col)
#         plt.pie(df[col].value_counts(), labels=df[col].unique(), autopct='%1.1f%%')
#         plt.show()



    Countplot comparing the values of the categorical column

In [None]:
# comparing the number of rows per category of each categorical column in bar plots

# Three plots per row; the number of rows follows the number of categorical
# columns, so the grid never runs out of slots.
n_grid_cols = 3
n_grid_rows = max(1, -(-len(cat_cols) // n_grid_cols))  # ceiling division
plt.figure(figsize=(12, 3 * n_grid_rows))
for i, col in enumerate(cat_cols):
    plt.subplot(n_grid_rows, n_grid_cols, 1 + i)
    sns.countplot(x=col, data=df)
    # a single title call: a second plt.title would simply overwrite the first
    plt.title(col, color='red')
plt.tight_layout()
plt.show()

# for col in df.columns:
#     if df[col].dtype == 'object':
#         sns.countplot(x=col, data=df)
#         plt.show()

    Number of occurrences of each value of a categorical variable - countplot

In [None]:
# One bar per travel class, split by satisfaction level through `hue`.
ax = sns.countplot(data=df, x='Class', hue='satisfaction', palette="YlOrBr")
ax.set_title('Class vs Satisfaction')
plt.show()

    Distribution of age groups split by customer type - histplot

In [None]:
sns.set(style="whitegrid")
plt.figure(figsize=(10, 6))
# 'Age Group' holds plain string labels, which are laid out along the x axis in
# the order they first appear in the data. Sorting the rows by Age puts the
# groups in ascending age order instead of an arbitrary one.
sns.histplot(df.sort_values('Age'), x = "Age Group", hue = "Customer Type", multiple='stack' , palette = "YlOrBr", edgecolor = ".3", linewidth = .5)
plt.title('Customer Type by Age Group')
plt.show()

    Distribution of a numerical variable - histogram (matplotlib)

In [None]:
# Histogram of passenger ages, drawn with matplotlib directly.
sns.set(style="whitegrid")

fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(df['Age'], bins=20, color='blue', alpha=0.7)
ax.set_title('Age Distribution')
ax.set_xlabel('Age')
plt.show()


# sns.set(style="whitegrid")
# plt.figure(figsize = (15,12))
# for i, cols in enumerate(num_cols):
#     plt.subplot(3,3,1+i)
#     sns.histplot(data = df,x = cols,kde = True)
# plt.tight_layout()
# plt.show()

    Distribution, median, outliers across categories - boxplot

In [None]:
# One box of Age per satisfaction value: median, quartiles and outliers side by side.
# NOTE(review): `palette` is passed without `hue`; newer seaborn releases appear to
# deprecate that combination - confirm against the installed version.
sns.boxplot(x = 'satisfaction', y = 'Age', data = df, palette = "YlOrBr")
plt.title('Age vs Satisfaction')
plt.show()


    The relationship between two continuous variables - scatter plot

In [None]:
# Both columns are produced by text substitutions, so they may still hold strings.
# Plotting strings makes matplotlib treat the axes as categorical (one tick per
# distinct value) instead of continuous. Coerce to numbers first; values that
# cannot be parsed become NaN and are not drawn.
budget = pd.to_numeric(df1['Budget'], errors='coerce')
worldwide_gross = pd.to_numeric(df1['Worldwide Gross'], errors='coerce')

plt.figure(figsize=(10, 6))
plt.scatter(budget, worldwide_gross, alpha=0.5)
plt.title('Budget vs Worldwide Gross')
plt.xlabel('Budget')
plt.ylabel('Worldwide Gross')
plt.show()

    Using plotly


In [None]:
import plotly.express as px
# Interactive histogram of Age coloured by satisfaction, with a box plot drawn in
# the margin (marginal='box'); hovering shows the columns listed in hover_data.
px.histogram(df, x = 'Age', color = 'satisfaction', title = 'Age vs Satisfaction', marginal = 'box', hover_data = df.columns)

    Correlation heatmap

# Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder

encoder = LabelEncoder()
encoder1 = OrdinalEncoder()

# Both lists are taken before the loop below rebinds df, so they describe the
# columns as they are before encoding.
cat_cols = df.select_dtypes(include=['object']).columns # select categorical columns
num_cols = df.select_dtypes(include=['number']).columns # select numerical columns

# Pick the encoding by number of distinct values. The three branches cover every
# cardinality: the earlier `10 > cardinality > 4` / `cardinality > 10` pair
# skipped columns with exactly 10 values and left them unencoded.
for col in cat_cols:
    cardinality = df[col].nunique()
    if cardinality <= 4:
        # one-hot encoding
        dummies = pd.get_dummies(df[col], prefix=col, dtype=int)
        # drop the original column and merge the dummy columns
        df = pd.concat([df.drop(col, axis=1), dummies], axis=1)
    elif cardinality <= 10:
        # Apply label encoding (5 to 10 unique values)
        df[col] = encoder.fit_transform(df[col])
    else:
        # Apply ordinal encoding for columns with more than 10 unique values.
        # The encoder expects and returns a 2-D array; ravel() flattens the single
        # column so a 1-D array is assigned.
        df[col] = encoder1.fit_transform(df[[col]]).ravel()

In [None]:

cat_cols1 = df1.select_dtypes(include=['object']).columns.tolist() # select categorical columns
num_cols1 = df1.select_dtypes(include=['number']).columns.tolist() # select numerical columns

# Pick the encoding by number of distinct values. `<= 10` closes the gap of the
# earlier `10 > cardinality1 > 4` test, which skipped columns with exactly 10 values.
for col in cat_cols1:
    cardinality1 = df1[col].nunique()
    if cardinality1 <= 4:
        # one-hot encoding
        dummies = pd.get_dummies(df1[col], prefix=col, dtype=int)
        # drop the original column and merge the dummy columns
        df1 = pd.concat([df1.drop(col, axis=1), dummies], axis=1)
    elif cardinality1 <= 10:
        # Apply label encoding (5 to 10 unique values)
        df1[col] = encoder.fit_transform(df1[col])
    # columns with more than 10 distinct values are left unencoded here
   

    Correlation heatmap

In [None]:
# Pairwise correlations of all columns of df, annotated with two decimals.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(df.corr(), annot=True, cmap="coolwarm", linewidths=0.2, fmt=".2f", ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

# Scaling

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

# Three scalers are created; only `scaler` and `scaler2` are used in this cell.
scaler = StandardScaler()
scaler1 = MinMaxScaler()
scaler2 = RobustScaler()

# Every column except the target, including the one-hot columns added during encoding.
cols = df.columns[df.columns != 'satisfaction'] # select all columns except the target column

# NOTE(review): both scalers are fitted on the complete frames, before the
# train/test split further down, so rows that later form the test set influence
# the scaling statistics. Fitting on the training rows only would avoid that.
df[cols] = scaler.fit_transform(df[cols])
# df1: only the columns that were numeric when num_cols1 was built are scaled.
df1[num_cols1] = scaler2.fit_transform(df1[num_cols1])

# Mutual Information

In [None]:
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression

# 'satisfaction' is a 0/1 class label, so the classification estimator is the
# right one; mutual_info_regression treats the target as a continuous quantity.
mi_features = df.drop('satisfaction', axis=1)
mi_scores = mutual_info_classif(
    mi_features,
    df['satisfaction'],
    discrete_features=True,
    random_state=42,  # fixed seed so the scores are reproducible
)
# NOTE(review): discrete_features=True treats every feature as discrete, also the
# continuous ones - consider passing a per-feature boolean mask instead.
mi_scores = pd.Series(mi_scores, index=mi_features.columns).sort_values(ascending=False)
mi_scores.plot.bar(figsize=(10, 6))
plt.title('Mutual Information Scores')
plt.show()

In [None]:
# The scores are sorted in descending order, so the first five entries are the top five.
top_mi_scores = mi_scores[:5]
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=top_mi_scores, y=top_mi_scores.index, ax=ax)
ax.set_title('Top 5 Features by Mutual Information')
plt.show()

# Skewness

In [None]:
# Skewness of every numeric column, shown as a one-column table with the most
# right-skewed column first.
skewness = df[num_cols].skew()
skewness_df = (
    skewness
    .reindex(num_cols)
    .to_frame('Skewness')
    .sort_values(by='Skewness', ascending=False)
)
skewness_df

    skewness transformation using boxcox

In [None]:
from scipy import stats

# Box-Cox is only defined for strictly positive input. A fixed `+ 1` guarantees
# that only for non-negative data, and this column has already been standardized
# (it contains negative values). Shifting by the column minimum makes the
# smallest value exactly 1; for data whose minimum is 0 this is identical to `+ 1`.
delay = df['Departure Delay in Minutes']
df['Departure Delay in Minutes'], lambda_value = stats.boxcox(delay - delay.min() + 1) # boxcox transformation

# for col in num_cols:
#     df[col], lambda_value = stats.boxcox(df[col] + 1)

In [None]:
# for col in num_cols:
#   if 0.5 < skewness[col] < 1 or -1 < skewness[col] < -0.5:
#     df[col] = np.power(df[col], 1/2)
#   elif -2 < skewness[col] < -1 or 1 < skewness[col] < 2:
#     df[col] = np.power(df[col], 3/2)
#   elif skewness[col] < -2 or skewness[col] > 2:
#     df[col] = 1 / df[col] # or np.log1p(df[col])
  
# skewness_new = df[num_cols].skew().sort_values(ascending=False)
# print(skewness_new)


# Model training and evaluation

In [None]:
from sklearn.model_selection import train_test_split

# Features = every column except the target; target = 'satisfaction'.
x = df.drop('satisfaction', axis=1)
y = df['satisfaction']
# First split: 80% training, 20% held out.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
# Second split: the held-out 20% is halved into test and validation (10% of the data each).
# NOTE(review): x_val / y_val are not used anywhere else in the visible notebook.
x_test, x_val, y_test, y_val = train_test_split(x_test, y_test, test_size=0.5, random_state=42)

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings.
lg = LogisticRegression()

lg.fit(x_train, y_train)
# NOTE: `y_pred` is a shared name; every later model cell overwrites it.
y_pred = lg.predict(x_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Recompute the predictions from `lg` in this cell. `y_pred` is a shared name that
# every model cell overwrites, so relying on it would report another model's
# predictions whenever the cells are run out of order.
y_pred = lg.predict(x_test)

accuracy_score_lg = lg.score(x_test, y_test)
print("Accuracy Score: ", accuracy_score_lg)

# Despite the `cm_` prefix this is the classification report as a nested dict
# (per-class precision / recall / f1-score); the confusion matrix itself is `cm`.
cm_lg = classification_report(y_test, y_pred, output_dict=True)
print(classification_report(y_test, y_pred))
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(10, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
# calculate the confusion matrix and derive the metrics from its four cells by hand
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
# rows are the actual class, columns the predicted class, so the flattened
# 2x2 matrix reads: TN, FP, FN, TP
true_negative, false_positive, false_negative, true_positive = cm.ravel()
total_predictions = true_positive + true_negative + false_positive + false_negative
accuracy = (true_positive + true_negative) / total_predictions
precision = true_positive / (true_positive + false_positive)
recall = true_positive / (true_positive + false_negative)
f1_score = 2 * (precision * recall) / (precision + recall)
for metric_label, metric_value in (
    ("Accuracy: ", accuracy),
    ("Precision: ", precision),
    ("Recall: ", recall),
    ("F1 Score: ", f1_score),
):
    print(metric_label, metric_value)

# Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
# Fixed seed so the fitted tree, and every score derived from it, is the same on
# each run; 42 matches the seed used for the train/test split.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(x_train, y_train)
y_pred = dt.predict(x_test)

In [None]:
# Predict with `dt` here instead of trusting the shared `y_pred`, which every
# model cell overwrites and which may belong to another model.
y_pred = dt.predict(x_test)
accuracy_score_dt = dt.score(x_test, y_test)
print("Accuracy Score: ", accuracy_score_dt)
# classification report as a nested dict (used by the summary table) and as text
cm_dt = classification_report(y_test, y_pred, output_dict=True)
print(classification_report(y_test, y_pred))

# Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

# Fixed seed so the forest, and every score derived from it, is the same on each
# run; 42 matches the seed used for the train/test split.
rf = RandomForestClassifier(random_state=42)
rf.fit(x_train, y_train)
y_pred = rf.predict(x_test)

In [None]:
# Predict with `rf` here instead of trusting the shared `y_pred`, which every
# model cell overwrites and which may belong to another model.
y_pred = rf.predict(x_test)
accuracy_score_rf = rf.score(x_test, y_test)
print("Accuracy Score: ", accuracy_score_rf)
# classification report as a nested dict (used by the summary table) and as text
cm_rf = classification_report(y_test, y_pred, output_dict=True)
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor


# Fixed seed so the fitted model, and every score derived from it, is the same on
# each run; 42 matches the seed used for the train/test split.
gb = GradientBoostingClassifier(random_state=42)
gb.fit(x_train, y_train)
y_pred = gb.predict(x_test)

In [None]:
# Predict with `gb` here instead of trusting the shared `y_pred`, which every
# model cell overwrites and which may belong to another model.
y_pred = gb.predict(x_test)
accuracy_score_gb = gb.score(x_test, y_test)
print("Accuracy Score: ", accuracy_score_gb)
# classification report as a nested dict (used by the summary table) and as text
cm_gb = classification_report(y_test, y_pred, output_dict=True)
print(classification_report(y_test, y_pred))

In [None]:
from xgboost import XGBRegressor, XGBClassifier


# XGBoost classifier with default settings.
# NOTE: the variable is named `xgb`, which is also the alias commonly given to
# the xgboost module itself; the module is not imported under that name here.
xgb = XGBClassifier()
xgb.fit(x_train, y_train)
y_pred = xgb.predict(x_test)  # overwrites the shared `y_pred`

In [None]:
# Predict with `xgb` here instead of trusting the shared `y_pred`, which every
# model cell overwrites and which may belong to another model.
y_pred = xgb.predict(x_test)
accuracy_score_xgb = xgb.score(x_test, y_test)
print("Accuracy Score: ", accuracy_score_xgb)
# classification report as a nested dict (used by the summary table) and as text
cm_xgb = classification_report(y_test, y_pred, output_dict=True)
print(classification_report(y_test, y_pred))

In [None]:
from lightgbm import LGBMRegressor, LGBMClassifier


# LightGBM classifier with default settings.
lgb = LGBMClassifier()
lgb.fit(x_train, y_train)
y_pred = lgb.predict(x_test)  # overwrites the shared `y_pred`

In [None]:
# Predict with `lgb` here instead of trusting the shared `y_pred`, which every
# model cell overwrites and which may belong to another model.
y_pred = lgb.predict(x_test)
accuracy_score_lgb = lgb.score(x_test, y_test)
print("Accuracy Score: ", accuracy_score_lgb)
# classification report as a nested dict (used by the summary table) and as text
cm_lgb = classification_report(y_test, y_pred, output_dict=True)
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


In [None]:
from tabulate import tabulate

# (display name, accuracy on the test set, classification report dict) per model
model_results = [
    ("Logistic Regression", accuracy_score_lg, cm_lg),
    ("Decision Tree Classifier", accuracy_score_dt, cm_dt),
    ("Random Forest Classifier", accuracy_score_rf, cm_rf),
    ("Gradient Boosting Classifier", accuracy_score_gb, cm_gb),
    ("XGBoost Classifier", accuracy_score_xgb, cm_xgb),
    ("LightGBM Classifier", accuracy_score_lgb, cm_lgb),
]

# One table row per model. Precision, recall and F1 are read from the report
# entry of class '0'.
data = [
    [model_name, accuracy, report['0']['precision'], report['0']['recall'], report['0']['f1-score']]
    for model_name, accuracy, report in model_results
]

headers = ['Model', 'accuracy score', 'Precision', 'Recall', 'F1-Score']
print(tabulate(data, headers=headers, tablefmt='grid'))

# Hyperparameter tuning - GridSearch

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV


# Search space for the decision tree.
# max_features accepts 'sqrt', 'log2', None (the Python object, meaning "all
# features") or a number. 'log' and the string 'None' are not valid values, and
# 'auto' has been removed from recent scikit-learn releases (for classifiers it
# meant the same as 'sqrt'). Invalid values make every fit that uses them fail.
param_grid = {
    'max_depth': [3, 5, 12, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5],
    'max_features': ['sqrt', 'log2', None],
}

grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, # estimator is the model to be tuned
                           verbose=1, # show the progress of the grid search
                           n_jobs=-1, # use all processors
                           cv=5, # number of cross-validation folds
                           scoring='accuracy') # scoring metric to be used


grid_search.fit(x_train, y_train) # fit the grid search to the training data



In [None]:
best_model_dt = grid_search.best_estimator_ # get the best model from the grid search
y_pred=best_model_dt.predict(x_test) # predict the test data using the best model
# NOTE: this overwrites `accuracy_score_dt`, the name the summary table reads for
# the untuned decision tree; re-running that table afterwards shows the tuned score.
accuracy_score_dt = best_model_dt.score(x_test, y_test)
print("Accuracy Score: ", accuracy_score_dt)


# Randomized Search

In [None]:
# Randomized search over the same space as the grid search: instead of trying
# every combination it evaluates n_iter sampled combinations.
# No `scoring` is passed here, unlike the grid search above.
random_search = RandomizedSearchCV(
    estimator=dt,
    param_distributions=param_grid,  # reuses the dict defined in the grid-search cell
    n_iter=10, # number of parameter combinations to sample
    verbose=1,
    cv=5,
    n_jobs=-1,
    random_state=42,  # fixes which combinations are sampled
)
random_search.fit(x_train, y_train) # fit the random search to the training data

In [None]:
random_model_dt = random_search.best_estimator_ # get the best model from the random search
y_pred=random_model_dt.predict(x_test) # predict the test data using the best model
# NOTE: overwrites `accuracy_score_dt` again (previously set by the untuned tree
# and by the grid-search cell).
accuracy_score_dt = random_model_dt.score(x_test, y_test)
print("Accuracy Score: ", accuracy_score_dt)


# Dictionary

In [None]:
# name -> age
students = {'Khusniddin': 19, '023': 20, 'Maftuna': 18, 'Abu': 20}
# update() adds the new key at the end and overwrites the value of an existing key in place
students.update({'Qutbiddin': 17, 'Maftuna': 19})
print(students)


In [None]:
students.get('023', 0)# get the value of the key '023' if it exists, otherwise return 0

In [None]:
# creating a function that counts how many times a word appears in a sentence and stores it in a dictionary

def word_count(sentence):
    """Return a dict mapping each whitespace-separated word of `sentence` to its number of occurrences.

    Keys appear in the order in which the words are first seen.
    """
    word_counts = {}
    for word in sentence.split():
        # get() supplies 0 for a word that has not been counted yet
        word_counts[word] = word_counts.get(word, 0) + 1
    return word_counts

sentence = '023 Qutbiddin 023 Maftuna 023 Khusniddin 023 Qutbiddin Khusniddin Saidabror'
result = word_count(sentence)
print(result)


In [None]:
# get the value of the key '023' if it exists, otherwise return 0
# (dict.get never raises KeyError; the second argument is the fallback value)

students.get('023', 0)

In [None]:
# create a function that swaps the keys and values of a dictionary

original = {1:'a', 2:'b', 3: 'c'}
def swap(x):
    """Return a new dict in which every value of `x` maps to its key.

    If a value occurs more than once, the key seen last wins.
    """
    inverted = {}
    for key, value in x.items():
        inverted[value] = key
    return inverted
swap(original)

In [None]:
# creating a function that swaps the keys with values only if the values are strings

def swapp(x):
    """Return a new dict mapping every string value of `x` to its key; other entries are dropped."""
    inverted = {}
    for key, value in x.items():
        if not isinstance(value, str):
            continue  # non-string values are left out of the result
        inverted[value] = key
    return inverted
data = {1: 'apple', 2: 'banana', 3: 100, 4: 'cherry', 5: 200}
swapp(data)

# List

In [None]:
# A list can mix types: int, float and two strings ('True' in quotes is a string, not a boolean).
a=[1,2.4,'Hello','True']
type(a)  # list

In [None]:
# The values in the comments assume the cell runs once on a = [1, 2.4, 'Hello', 'True'].

# adding to the list from the end
a.append('japan')  # a is now [1, 2.4, 'Hello', 'True', 'japan']


# removing from the list
a.remove('True')  # removes the first matching value; a is now [1, 2.4, 'Hello', 'japan']

# removing from the list by index
a.pop(2)  # removes and returns 'Hello'; a is now [1, 2.4, 'japan']

# length of the list
len(a)  # 3 (not displayed: a cell only shows its last expression)

# sorting the list
# (left disabled: this list mixes numbers and strings, which cannot be compared)
# a.sort()


# sorting the list in reverse order
# a.sort(reverse=True)

# inserting into the list
a.insert(2, 'Python')  # a is now [1, 2.4, 'Python', 'japan']

# reversing the list
a.reverse()  # reverses in place; a is now ['japan', 'Python', 2.4, 1]

# slicing the list (each slice is a new list; a itself is not changed)
a[1:3] # returns ['Python', 2.4]
a[1:] # returns ['Python', 2.4, 1]
a[:3] # returns ['japan', 'Python', 2.4]
a[::2] # returns ['japan', 2.4]
a[::-1] # returns [1, 2.4, 'Python', 'japan']

print(a)  # prints ['japan', 'Python', 2.4, 1]


# Tuple

In [None]:
# creating a tuple (an ordered sequence that cannot be modified after creation)
fruits = ('apple', 'banana','cherry')
print(fruits)
type(fruits)  # tuple

In [None]:
# concatenating two tuples
vegetables = ('cabbage', 'tomato', 'potato')
food = fruits + vegetables  # + builds a new tuple; fruits and vegetables are unchanged
print(food)


In [None]:
# iterating over a tuple: the loop visits the elements one by one
# (unpacking would be assigning the elements to several names at once)
for item in food:
    print(item)
 

In [None]:
# adding to the tuple
# Tuples cannot be changed in place, so "adding" always builds a new tuple.
# Way 1: concatenate with a one-element tuple (the trailing comma makes it a tuple).
fruits = fruits + ('orange',)

# Way 2: convert to a list, append, convert back.
fruits_list = list(fruits)
fruits_list.append('strawberry')
fruits = tuple(fruits_list)
fruits

In [None]:
# removing from the tuple
# Tuples have no remove/pop, so the work is done on a list copy.
list_fruits = list(fruits)
list_fruits.remove('orange')  # removes the first matching value

# removing from the tuple by index
list_fruits.pop(2)  # removes and returns the element at index 2
fruits = tuple(list_fruits)
print(fruits)

In [None]:

# finding the index in tuples
fruits = ('apple', 'banana', 'cherry', 'orange', 'strawberry')
# index() raises ValueError when the element is absent, so catching it is
# equivalent to checking membership first
try:
    print(fruits.index('strawberry'))
except ValueError:
    print('The element is not found.')

In [None]:
# tuple of tuples
# printing the name of the second student
# NOTE: this rebinds `students`, which was a dict in the Dictionary section.
students = (("Alice", 20), ("Bob", 22), ("Charlie", 21))
students[1][0]  # first index picks the (name, age) pair, second picks the name: 'Bob'