<a href="https://colab.research.google.com/github/rohitpan/datasciencecoursera/blob/master/DGN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Step 1: Load the dataset
df = pd.read_csv('./dga_data.csv')  # Replace with the correct path to your CSV file
df = df.dropna(subset=['domain'])  # Remove rows where 'domain' is NaN
# Step 2: Preprocess the data
df['isDGA'] = df['isDGA'].apply(lambda x: 1 if x == 'dga' else 0)  # Convert 'dga' to 1 and 'legit' to 0

# Step 3: Generate TF-IDF features
tfidf_vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(3, 5))  # Character-level 3-5 ngrams
X = tfidf_vectorizer.fit_transform(df['domain'])
y = df['isDGA']

# Step 4: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 5: Train a Machine Learning Model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Step 6: Evaluate the Model
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))




In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.decomposition import TruncatedSVD

# Step 1: Load the dataset
df = pd.read_csv('./dga_data.csv')  # Replace with the correct path to your CSV file

# Step 2: Preprocess the data
df = df.dropna(subset=['domain'])  # Remove rows where 'domain' is NaN
df['isDGA'] = df['isDGA'].apply(lambda x: 1 if x == 'dga' else 0)  # Convert 'dga' to 1 and 'legit' to 0

# Step 3: Generate TF-IDF features
tfidf_vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(3, 5), max_features=5000)  # Limit to 5000 features
X = tfidf_vectorizer.fit_transform(df['domain'])
y = df['isDGA']

# Optional: Dimensionality Reduction
svd = TruncatedSVD(n_components=100, random_state=42)
X = svd.fit_transform(X)

# Step 4: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 5: Train a Machine Learning Model
model = RandomForestClassifier(n_estimators=50, max_depth=10, n_jobs=-1, random_state=42)
model.fit(X_train, y_train)

# Step 6: Evaluate the Model
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.81603125
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.79      0.81     16123
           1       0.80      0.84      0.82     15877

    accuracy                           0.82     32000
   macro avg       0.82      0.82      0.82     32000
weighted avg       0.82      0.82      0.82     32000



In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Load the dataset
df = pd.read_csv('./dga_data.csv')

# Preprocess the data
df = df.dropna(subset=['domain'])
df['isDGA'] = df['isDGA'].apply(lambda x: 1 if x == 'dga' else 0)

# Generate TF-IDF features with different n-gram range
tfidf_vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 6), max_features=5000)
X = tfidf_vectorizer.fit_transform(df['domain'])
y = df['isDGA']

# Optional: Dimensionality Reduction
svd = TruncatedSVD(n_components=100, random_state=42)
X = svd.fit_transform(X)


# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyperparameter Tuning with Grid Search
param_grid = {
    'n_estimators': [50, 75],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt']
}

grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Use the best model from Grid Search
model = grid_search.best_estimator_

# Evaluate the Model
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))


KeyboardInterrupt: 